2003-07-14 18:41:38 +02:00
|
|
|
/*-----------------------------------------------------------------------
|
2000-08-05 16:59:29 +02:00
|
|
|
* ascii.c
|
2003-07-14 18:41:38 +02:00
|
|
|
* The PostgreSQL routine for string to ascii conversion.
|
2000-08-05 16:59:29 +02:00
|
|
|
*
|
2020-01-01 18:21:45 +01:00
|
|
|
* Portions Copyright (c) 1999-2020, PostgreSQL Global Development Group
|
2000-08-05 16:59:29 +02:00
|
|
|
*
|
2003-07-14 18:41:38 +02:00
|
|
|
* IDENTIFICATION
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/backend/utils/adt/ascii.c
|
2000-08-05 16:59:29 +02:00
|
|
|
*
|
2003-07-14 18:41:38 +02:00
|
|
|
*-----------------------------------------------------------------------
|
2000-08-05 16:59:29 +02:00
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
2003-07-14 18:41:38 +02:00
|
|
|
|
2000-08-05 16:59:29 +02:00
|
|
|
#include "mb/pg_wchar.h"
|
|
|
|
#include "utils/ascii.h"
|
2016-12-28 18:00:00 +01:00
|
|
|
#include "utils/builtins.h"
|
2000-08-05 16:59:29 +02:00
|
|
|
|
2003-07-14 18:41:38 +02:00
|
|
|
static void pg_to_ascii(unsigned char *src, unsigned char *src_end,
|
2019-05-22 19:04:48 +02:00
|
|
|
unsigned char *dest, int enc);
|
2000-08-05 16:59:29 +02:00
|
|
|
static text *encode_to_ascii(text *data, int enc);
|
|
|
|
|
2003-07-14 18:41:38 +02:00
|
|
|
|
2000-08-05 16:59:29 +02:00
|
|
|
/* ----------
|
2001-03-22 05:01:46 +01:00
|
|
|
* to_ascii
|
2000-08-05 16:59:29 +02:00
|
|
|
* ----------
|
|
|
|
*/
|
2003-07-14 18:41:38 +02:00
|
|
|
static void
|
|
|
|
pg_to_ascii(unsigned char *src, unsigned char *src_end, unsigned char *dest, int enc)
|
2000-08-05 16:59:29 +02:00
|
|
|
{
|
2003-04-02 23:07:59 +02:00
|
|
|
unsigned char *x;
|
2004-12-20 20:00:37 +01:00
|
|
|
const unsigned char *ascii;
|
2003-04-02 23:07:59 +02:00
|
|
|
int range;
|
2001-03-22 05:01:46 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* relevant start for an encoding
|
|
|
|
*/
|
|
|
|
#define RANGE_128 128
|
|
|
|
#define RANGE_160 160
|
|
|
|
|
Commit Karel's patch.
-------------------------------------------------------------------
Subject: Re: [PATCHES] encoding names
From: Karel Zak <zakkr@zf.jcu.cz>
To: Peter Eisentraut <peter_e@gmx.net>
Cc: pgsql-patches <pgsql-patches@postgresql.org>
Date: Fri, 31 Aug 2001 17:24:38 +0200
On Thu, Aug 30, 2001 at 01:30:40AM +0200, Peter Eisentraut wrote:
> > - convert encoding 'name' to 'id'
>
> I thought we decided not to add functions returning "new" names until we
> know exactly what the new names should be, and pending schema
Ok, the patch not to add functions.
> better
>
> ...(): encoding name too long
Fixed.
I found new bug in command/variable.c in parse_client_encoding(), nobody
probably never see this error:
if (pg_set_client_encoding(encoding))
{
elog(ERROR, "Conversion between %s and %s is not supported",
value, GetDatabaseEncodingName());
}
because pg_set_client_encoding() returns -1 for error and 0 as true.
It's fixed too.
IMHO it can be apply.
Karel
PS:
* following files are renamed:
src/utils/mb/Unicode/KOI8_to_utf8.map -->
src/utils/mb/Unicode/koi8r_to_utf8.map
src/utils/mb/Unicode/WIN_to_utf8.map -->
src/utils/mb/Unicode/win1251_to_utf8.map
src/utils/mb/Unicode/utf8_to_KOI8.map -->
src/utils/mb/Unicode/utf8_to_koi8r.map
src/utils/mb/Unicode/utf8_to_WIN.map -->
src/utils/mb/Unicode/utf8_to_win1251.map
* new file:
src/utils/mb/encname.c
* removed file:
src/utils/mb/common.c
--
Karel Zak <zakkr@zf.jcu.cz>
http://home.zf.jcu.cz/~zakkr/
C, PostgreSQL, PHP, WWW, http://docs.linux.cz, http://mape.jcu.cz
2001-09-06 06:57:30 +02:00
|
|
|
if (enc == PG_LATIN1)
|
2000-08-05 16:59:29 +02:00
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-08-05 16:59:29 +02:00
|
|
|
* ISO-8859-1 <range: 160 -- 255>
|
|
|
|
*/
|
2005-09-24 19:53:28 +02:00
|
|
|
ascii = (const unsigned char *) " cL Y \"Ca -R 'u ., ?AAAAAAACEEEEIIII NOOOOOxOUUUUYTBaaaaaaaceeeeiiii nooooo/ouuuuyty";
|
2000-08-05 16:59:29 +02:00
|
|
|
range = RANGE_160;
|
|
|
|
}
|
Commit Karel's patch.
-------------------------------------------------------------------
Subject: Re: [PATCHES] encoding names
From: Karel Zak <zakkr@zf.jcu.cz>
To: Peter Eisentraut <peter_e@gmx.net>
Cc: pgsql-patches <pgsql-patches@postgresql.org>
Date: Fri, 31 Aug 2001 17:24:38 +0200
On Thu, Aug 30, 2001 at 01:30:40AM +0200, Peter Eisentraut wrote:
> > - convert encoding 'name' to 'id'
>
> I thought we decided not to add functions returning "new" names until we
> know exactly what the new names should be, and pending schema
Ok, the patch not to add functions.
> better
>
> ...(): encoding name too long
Fixed.
I found new bug in command/variable.c in parse_client_encoding(), nobody
probably never see this error:
if (pg_set_client_encoding(encoding))
{
elog(ERROR, "Conversion between %s and %s is not supported",
value, GetDatabaseEncodingName());
}
because pg_set_client_encoding() returns -1 for error and 0 as true.
It's fixed too.
IMHO it can be apply.
Karel
PS:
* following files are renamed:
src/utils/mb/Unicode/KOI8_to_utf8.map -->
src/utils/mb/Unicode/koi8r_to_utf8.map
src/utils/mb/Unicode/WIN_to_utf8.map -->
src/utils/mb/Unicode/win1251_to_utf8.map
src/utils/mb/Unicode/utf8_to_KOI8.map -->
src/utils/mb/Unicode/utf8_to_koi8r.map
src/utils/mb/Unicode/utf8_to_WIN.map -->
src/utils/mb/Unicode/utf8_to_win1251.map
* new file:
src/utils/mb/encname.c
* removed file:
src/utils/mb/common.c
--
Karel Zak <zakkr@zf.jcu.cz>
http://home.zf.jcu.cz/~zakkr/
C, PostgreSQL, PHP, WWW, http://docs.linux.cz, http://mape.jcu.cz
2001-09-06 06:57:30 +02:00
|
|
|
else if (enc == PG_LATIN2)
|
2000-08-05 16:59:29 +02:00
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-08-05 16:59:29 +02:00
|
|
|
* ISO-8859-2 <range: 160 -- 255>
|
|
|
|
*/
|
2005-09-24 19:53:28 +02:00
|
|
|
ascii = (const unsigned char *) " A L LS \"SSTZ-ZZ a,l'ls ,sstz\"zzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTBraaaalccceeeeiiddnnoooo/ruuuuyt.";
|
2000-08-05 16:59:29 +02:00
|
|
|
range = RANGE_160;
|
|
|
|
}
|
2004-12-20 20:00:37 +01:00
|
|
|
else if (enc == PG_LATIN9)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* ISO-8859-15 <range: 160 -- 255>
|
|
|
|
*/
|
2005-09-24 19:53:28 +02:00
|
|
|
ascii = (const unsigned char *) " cL YS sCa -R Zu .z EeY?AAAAAAACEEEEIIII NOOOOOxOUUUUYTBaaaaaaaceeeeiiii nooooo/ouuuuyty";
|
2004-12-20 20:00:37 +01:00
|
|
|
range = RANGE_160;
|
|
|
|
}
|
Commit Karel's patch.
-------------------------------------------------------------------
Subject: Re: [PATCHES] encoding names
From: Karel Zak <zakkr@zf.jcu.cz>
To: Peter Eisentraut <peter_e@gmx.net>
Cc: pgsql-patches <pgsql-patches@postgresql.org>
Date: Fri, 31 Aug 2001 17:24:38 +0200
On Thu, Aug 30, 2001 at 01:30:40AM +0200, Peter Eisentraut wrote:
> > - convert encoding 'name' to 'id'
>
> I thought we decided not to add functions returning "new" names until we
> know exactly what the new names should be, and pending schema
Ok, the patch not to add functions.
> better
>
> ...(): encoding name too long
Fixed.
I found new bug in command/variable.c in parse_client_encoding(), nobody
probably never see this error:
if (pg_set_client_encoding(encoding))
{
elog(ERROR, "Conversion between %s and %s is not supported",
value, GetDatabaseEncodingName());
}
because pg_set_client_encoding() returns -1 for error and 0 as true.
It's fixed too.
IMHO it can be apply.
Karel
PS:
* following files are renamed:
src/utils/mb/Unicode/KOI8_to_utf8.map -->
src/utils/mb/Unicode/koi8r_to_utf8.map
src/utils/mb/Unicode/WIN_to_utf8.map -->
src/utils/mb/Unicode/win1251_to_utf8.map
src/utils/mb/Unicode/utf8_to_KOI8.map -->
src/utils/mb/Unicode/utf8_to_koi8r.map
src/utils/mb/Unicode/utf8_to_WIN.map -->
src/utils/mb/Unicode/utf8_to_win1251.map
* new file:
src/utils/mb/encname.c
* removed file:
src/utils/mb/common.c
--
Karel Zak <zakkr@zf.jcu.cz>
http://home.zf.jcu.cz/~zakkr/
C, PostgreSQL, PHP, WWW, http://docs.linux.cz, http://mape.jcu.cz
2001-09-06 06:57:30 +02:00
|
|
|
else if (enc == PG_WIN1250)
|
2000-08-05 16:59:29 +02:00
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-08-05 16:59:29 +02:00
|
|
|
* Window CP1250 <range: 128 -- 255>
|
|
|
|
*/
|
2005-09-24 19:53:28 +02:00
|
|
|
ascii = (const unsigned char *) " ' \" %S<STZZ `'\"\".-- s>stzz L A \"CS -RZ ,l'u .,as L\"lzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTBraaaalccceeeeiiddnnoooo/ruuuuyt ";
|
2000-08-05 16:59:29 +02:00
|
|
|
range = RANGE_128;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2003-07-27 06:53:12 +02:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
2005-10-15 04:49:52 +02:00
|
|
|
errmsg("encoding conversion from %s to ASCII not supported",
|
|
|
|
pg_encoding_to_char(enc))));
|
2003-07-14 18:41:38 +02:00
|
|
|
return; /* keep compiler quiet */
|
2000-08-05 16:59:29 +02:00
|
|
|
}
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-08-05 16:59:29 +02:00
|
|
|
* Encode
|
|
|
|
*/
|
2003-04-02 23:07:59 +02:00
|
|
|
for (x = src; x < src_end; x++)
|
2000-08-05 16:59:29 +02:00
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
if (*x < 128)
|
2003-07-14 18:41:38 +02:00
|
|
|
*dest++ = *x;
|
2000-08-05 16:59:29 +02:00
|
|
|
else if (*x < range)
|
2003-07-14 18:41:38 +02:00
|
|
|
*dest++ = ' '; /* bogus 128 to 'range' */
|
2000-08-05 16:59:29 +02:00
|
|
|
else
|
2003-07-14 18:41:38 +02:00
|
|
|
*dest++ = ascii[*x - range];
|
2001-03-22 05:01:46 +01:00
|
|
|
}
|
2000-08-05 16:59:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* ----------
|
|
|
|
* encode text
|
2003-07-14 18:41:38 +02:00
|
|
|
*
|
|
|
|
* The text datum is overwritten in-place, therefore this coding method
|
|
|
|
* cannot support conversions that change the string length!
|
2000-08-05 16:59:29 +02:00
|
|
|
* ----------
|
|
|
|
*/
|
|
|
|
static text *
|
|
|
|
encode_to_ascii(text *data, int enc)
|
|
|
|
{
|
Phase 2 of pgindent updates.
Change pg_bsd_indent to follow upstream rules for placement of comments
to the right of code, and remove pgindent hack that caused comments
following #endif to not obey the general rule.
Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using
the published version of pg_bsd_indent, but a hacked-up version that
tried to minimize the amount of movement of comments to the right of
code. The situation of interest is where such a comment has to be
moved to the right of its default placement at column 33 because there's
code there. BSD indent has always moved right in units of tab stops
in such cases --- but in the previous incarnation, indent was working
in 8-space tab stops, while now it knows we use 4-space tabs. So the
net result is that in about half the cases, such comments are placed
one tab stop left of before. This is better all around: it leaves
more room on the line for comment text, and it means that in such
cases the comment uniformly starts at the next 4-space tab stop after
the code, rather than sometimes one and sometimes two tabs after.
Also, ensure that comments following #endif are indented the same
as comments following other preprocessor commands such as #else.
That inconsistency turns out to have been self-inflicted damage
from a poorly-thought-through post-indent "fixup" in pgindent.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:18:54 +02:00
|
|
|
pg_to_ascii((unsigned char *) VARDATA(data), /* src */
|
|
|
|
(unsigned char *) (data) + VARSIZE(data), /* src end */
|
|
|
|
(unsigned char *) VARDATA(data), /* dest */
|
2001-03-22 05:01:46 +01:00
|
|
|
enc); /* encoding */
|
|
|
|
|
2000-08-05 16:59:29 +02:00
|
|
|
return data;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ----------
|
|
|
|
* convert to ASCII - enc is set as 'name' arg.
|
|
|
|
* ----------
|
|
|
|
*/
|
|
|
|
Datum
|
|
|
|
to_ascii_encname(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2003-08-04 02:43:34 +02:00
|
|
|
text *data = PG_GETARG_TEXT_P_COPY(0);
|
2007-10-13 22:18:42 +02:00
|
|
|
char *encname = NameStr(*PG_GETARG_NAME(1));
|
|
|
|
int enc = pg_char_to_encoding(encname);
|
|
|
|
|
|
|
|
if (enc < 0)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_UNDEFINED_OBJECT),
|
|
|
|
errmsg("%s is not a valid encoding name", encname)));
|
2003-07-14 18:41:38 +02:00
|
|
|
|
|
|
|
PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
|
2000-08-05 16:59:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* ----------
|
|
|
|
* convert to ASCII - enc is set as int4
|
|
|
|
* ----------
|
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
Datum
|
2000-08-05 16:59:29 +02:00
|
|
|
to_ascii_enc(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2003-08-04 02:43:34 +02:00
|
|
|
text *data = PG_GETARG_TEXT_P_COPY(0);
|
|
|
|
int enc = PG_GETARG_INT32(1);
|
2003-07-14 18:41:38 +02:00
|
|
|
|
2007-10-13 22:18:42 +02:00
|
|
|
if (!PG_VALID_ENCODING(enc))
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_UNDEFINED_OBJECT),
|
|
|
|
errmsg("%d is not a valid encoding code", enc)));
|
|
|
|
|
2003-07-14 18:41:38 +02:00
|
|
|
PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
|
2000-08-05 16:59:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* ----------
|
|
|
|
* convert to ASCII - current enc is DatabaseEncoding
|
|
|
|
* ----------
|
|
|
|
*/
|
|
|
|
Datum
|
|
|
|
to_ascii_default(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2003-08-04 02:43:34 +02:00
|
|
|
text *data = PG_GETARG_TEXT_P_COPY(0);
|
|
|
|
int enc = GetDatabaseEncoding();
|
2003-07-14 18:41:38 +02:00
|
|
|
|
|
|
|
PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
|
2000-08-05 16:59:29 +02:00
|
|
|
}
|
2011-10-21 19:26:40 +02:00
|
|
|
|
|
|
|
/* ----------
 * ascii_safe_strlcpy
 *
 * Copy a string in an arbitrary backend-safe encoding, converting it to a
 * valid, printable ASCII string.  Printable ASCII (codes 32 through 126)
 * and the white-space characters '\n', '\r' and '\t' are copied as-is;
 * every other byte, including DEL (127) and all bytes >= 128, is replaced
 * with '?'.
 *
 * At most destsiz - 1 bytes are copied and the result is always
 * nul-terminated, unless destsiz is 0, in which case dest is not touched.
 * Otherwise the behavior is identical to strlcpy(), except that we don't
 * bother with a return value.
 *
 * This must not trigger ereport(ERROR), as it is called in postmaster.
 * ----------
 */
void
ascii_safe_strlcpy(char *dest, const char *src, size_t destsiz)
{
	if (destsiz == 0)			/* corner case: no room for trailing nul */
		return;

	/* the pre-decrement reserves one byte for the terminator */
	while (--destsiz > 0)
	{
		/* use unsigned char so that bytes >= 128 never compare as negative */
		unsigned char ch = (unsigned char) *src++;

		if (ch == '\0')
			break;

		/*
		 * Keep printable ASCII and the common white-space characters.  The
		 * upper bound is 126, not 127: DEL is a control character.
		 */
		if ((ch >= 32 && ch <= 126) ||
			ch == '\n' || ch == '\r' || ch == '\t')
			*dest++ = (char) ch;
		else
			*dest++ = '?';		/* everything else is replaced */
	}

	*dest = '\0';
}
|