postgresql/src/backend/utils/adt/ascii.c

199 lines
4.6 KiB
C
Raw Normal View History

/*-----------------------------------------------------------------------
* ascii.c
* The PostgreSQL routine for string to ascii conversion.
*
* Portions Copyright (c) 1999-2020, PostgreSQL Global Development Group
*
* IDENTIFICATION
2010-09-20 22:08:53 +02:00
* src/backend/utils/adt/ascii.c
*
*-----------------------------------------------------------------------
*/
#include "postgres.h"
#include "mb/pg_wchar.h"
#include "utils/ascii.h"
#include "utils/builtins.h"
static void pg_to_ascii(unsigned char *src, unsigned char *src_end,
unsigned char *dest, int enc);
static text *encode_to_ascii(text *data, int enc);
/* ----------
2001-03-22 05:01:46 +01:00
* to_ascii
* ----------
*/
static void
pg_to_ascii(unsigned char *src, unsigned char *src_end, unsigned char *dest, int enc)
{
unsigned char *x;
const unsigned char *ascii;
int range;
2001-03-22 05:01:46 +01:00
/*
* relevant start for an encoding
*/
#define RANGE_128 128
#define RANGE_160 160
Commit Karel's patch. ------------------------------------------------------------------- Subject: Re: [PATCHES] encoding names From: Karel Zak <zakkr@zf.jcu.cz> To: Peter Eisentraut <peter_e@gmx.net> Cc: pgsql-patches <pgsql-patches@postgresql.org> Date: Fri, 31 Aug 2001 17:24:38 +0200 On Thu, Aug 30, 2001 at 01:30:40AM +0200, Peter Eisentraut wrote: > > - convert encoding 'name' to 'id' > > I thought we decided not to add functions returning "new" names until we > know exactly what the new names should be, and pending schema Ok, the patch not to add functions. > better > > ...(): encoding name too long Fixed. I found new bug in command/variable.c in parse_client_encoding(), nobody probably never see this error: if (pg_set_client_encoding(encoding)) { elog(ERROR, "Conversion between %s and %s is not supported", value, GetDatabaseEncodingName()); } because pg_set_client_encoding() returns -1 for error and 0 as true. It's fixed too. IMHO it can be apply. Karel PS: * following files are renamed: src/utils/mb/Unicode/KOI8_to_utf8.map --> src/utils/mb/Unicode/koi8r_to_utf8.map src/utils/mb/Unicode/WIN_to_utf8.map --> src/utils/mb/Unicode/win1251_to_utf8.map src/utils/mb/Unicode/utf8_to_KOI8.map --> src/utils/mb/Unicode/utf8_to_koi8r.map src/utils/mb/Unicode/utf8_to_WIN.map --> src/utils/mb/Unicode/utf8_to_win1251.map * new file: src/utils/mb/encname.c * removed file: src/utils/mb/common.c -- Karel Zak <zakkr@zf.jcu.cz> http://home.zf.jcu.cz/~zakkr/ C, PostgreSQL, PHP, WWW, http://docs.linux.cz, http://mape.jcu.cz
2001-09-06 06:57:30 +02:00
if (enc == PG_LATIN1)
{
/*
* ISO-8859-1 <range: 160 -- 255>
*/
ascii = (const unsigned char *) " cL Y \"Ca -R 'u ., ?AAAAAAACEEEEIIII NOOOOOxOUUUUYTBaaaaaaaceeeeiiii nooooo/ouuuuyty";
range = RANGE_160;
}
Commit Karel's patch. ------------------------------------------------------------------- Subject: Re: [PATCHES] encoding names From: Karel Zak <zakkr@zf.jcu.cz> To: Peter Eisentraut <peter_e@gmx.net> Cc: pgsql-patches <pgsql-patches@postgresql.org> Date: Fri, 31 Aug 2001 17:24:38 +0200 On Thu, Aug 30, 2001 at 01:30:40AM +0200, Peter Eisentraut wrote: > > - convert encoding 'name' to 'id' > > I thought we decided not to add functions returning "new" names until we > know exactly what the new names should be, and pending schema Ok, the patch not to add functions. > better > > ...(): encoding name too long Fixed. I found new bug in command/variable.c in parse_client_encoding(), nobody probably never see this error: if (pg_set_client_encoding(encoding)) { elog(ERROR, "Conversion between %s and %s is not supported", value, GetDatabaseEncodingName()); } because pg_set_client_encoding() returns -1 for error and 0 as true. It's fixed too. IMHO it can be apply. Karel PS: * following files are renamed: src/utils/mb/Unicode/KOI8_to_utf8.map --> src/utils/mb/Unicode/koi8r_to_utf8.map src/utils/mb/Unicode/WIN_to_utf8.map --> src/utils/mb/Unicode/win1251_to_utf8.map src/utils/mb/Unicode/utf8_to_KOI8.map --> src/utils/mb/Unicode/utf8_to_koi8r.map src/utils/mb/Unicode/utf8_to_WIN.map --> src/utils/mb/Unicode/utf8_to_win1251.map * new file: src/utils/mb/encname.c * removed file: src/utils/mb/common.c -- Karel Zak <zakkr@zf.jcu.cz> http://home.zf.jcu.cz/~zakkr/ C, PostgreSQL, PHP, WWW, http://docs.linux.cz, http://mape.jcu.cz
2001-09-06 06:57:30 +02:00
else if (enc == PG_LATIN2)
{
/*
* ISO-8859-2 <range: 160 -- 255>
*/
ascii = (const unsigned char *) " A L LS \"SSTZ-ZZ a,l'ls ,sstz\"zzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTBraaaalccceeeeiiddnnoooo/ruuuuyt.";
range = RANGE_160;
}
else if (enc == PG_LATIN9)
{
/*
* ISO-8859-15 <range: 160 -- 255>
*/
ascii = (const unsigned char *) " cL YS sCa -R Zu .z EeY?AAAAAAACEEEEIIII NOOOOOxOUUUUYTBaaaaaaaceeeeiiii nooooo/ouuuuyty";
range = RANGE_160;
}
Commit Karel's patch. ------------------------------------------------------------------- Subject: Re: [PATCHES] encoding names From: Karel Zak <zakkr@zf.jcu.cz> To: Peter Eisentraut <peter_e@gmx.net> Cc: pgsql-patches <pgsql-patches@postgresql.org> Date: Fri, 31 Aug 2001 17:24:38 +0200 On Thu, Aug 30, 2001 at 01:30:40AM +0200, Peter Eisentraut wrote: > > - convert encoding 'name' to 'id' > > I thought we decided not to add functions returning "new" names until we > know exactly what the new names should be, and pending schema Ok, the patch not to add functions. > better > > ...(): encoding name too long Fixed. I found new bug in command/variable.c in parse_client_encoding(), nobody probably never see this error: if (pg_set_client_encoding(encoding)) { elog(ERROR, "Conversion between %s and %s is not supported", value, GetDatabaseEncodingName()); } because pg_set_client_encoding() returns -1 for error and 0 as true. It's fixed too. IMHO it can be apply. Karel PS: * following files are renamed: src/utils/mb/Unicode/KOI8_to_utf8.map --> src/utils/mb/Unicode/koi8r_to_utf8.map src/utils/mb/Unicode/WIN_to_utf8.map --> src/utils/mb/Unicode/win1251_to_utf8.map src/utils/mb/Unicode/utf8_to_KOI8.map --> src/utils/mb/Unicode/utf8_to_koi8r.map src/utils/mb/Unicode/utf8_to_WIN.map --> src/utils/mb/Unicode/utf8_to_win1251.map * new file: src/utils/mb/encname.c * removed file: src/utils/mb/common.c -- Karel Zak <zakkr@zf.jcu.cz> http://home.zf.jcu.cz/~zakkr/ C, PostgreSQL, PHP, WWW, http://docs.linux.cz, http://mape.jcu.cz
2001-09-06 06:57:30 +02:00
else if (enc == PG_WIN1250)
{
/*
* Window CP1250 <range: 128 -- 255>
*/
ascii = (const unsigned char *) " ' \" %S<STZZ `'\"\".-- s>stzz L A \"CS -RZ ,l'u .,as L\"lzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTBraaaalccceeeeiiddnnoooo/ruuuuyt ";
range = RANGE_128;
}
else
{
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2005-10-15 04:49:52 +02:00
errmsg("encoding conversion from %s to ASCII not supported",
pg_encoding_to_char(enc))));
return; /* keep compiler quiet */
}
2001-03-22 05:01:46 +01:00
/*
* Encode
*/
for (x = src; x < src_end; x++)
{
2001-03-22 05:01:46 +01:00
if (*x < 128)
*dest++ = *x;
else if (*x < range)
*dest++ = ' '; /* bogus 128 to 'range' */
else
*dest++ = ascii[*x - range];
2001-03-22 05:01:46 +01:00
}
}
/* ----------
* encode text
*
* The text datum is overwritten in-place, therefore this coding method
* cannot support conversions that change the string length!
* ----------
*/
static text *
encode_to_ascii(text *data, int enc)
{
Phase 2 of pgindent updates. Change pg_bsd_indent to follow upstream rules for placement of comments to the right of code, and remove pgindent hack that caused comments following #endif to not obey the general rule. Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using the published version of pg_bsd_indent, but a hacked-up version that tried to minimize the amount of movement of comments to the right of code. The situation of interest is where such a comment has to be moved to the right of its default placement at column 33 because there's code there. BSD indent has always moved right in units of tab stops in such cases --- but in the previous incarnation, indent was working in 8-space tab stops, while now it knows we use 4-space tabs. So the net result is that in about half the cases, such comments are placed one tab stop left of before. This is better all around: it leaves more room on the line for comment text, and it means that in such cases the comment uniformly starts at the next 4-space tab stop after the code, rather than sometimes one and sometimes two tabs after. Also, ensure that comments following #endif are indented the same as comments following other preprocessor commands such as #else. That inconsistency turns out to have been self-inflicted damage from a poorly-thought-through post-indent "fixup" in pgindent. This patch is much less interesting than the first round of indent changes, but also bulkier, so I thought it best to separate the effects. Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:18:54 +02:00
pg_to_ascii((unsigned char *) VARDATA(data), /* src */
(unsigned char *) (data) + VARSIZE(data), /* src end */
(unsigned char *) VARDATA(data), /* dest */
2001-03-22 05:01:46 +01:00
enc); /* encoding */
return data;
}
/* ----------
* convert to ASCII - enc is set as 'name' arg.
* ----------
*/
Datum
to_ascii_encname(PG_FUNCTION_ARGS)
{
2003-08-04 02:43:34 +02:00
text *data = PG_GETARG_TEXT_P_COPY(0);
Fix the inadvertent libpq ABI breakage discovered by Martin Pitt: the renumbering of encoding IDs done between 8.2 and 8.3 turns out to break 8.2 initdb and psql if they are run with an 8.3beta1 libpq.so. For the moment we can rearrange the order of enum pg_enc to keep the same number for everything except PG_JOHAB, which isn't a problem since there are no direct references to it in the 8.2 programs anyway. (This does force initdb unfortunately.) Going forward, we want to fix things so that encoding IDs can be changed without an ABI break, and this commit includes the changes needed to allow libpq's encoding IDs to be treated as fully independent of the backend's. The main issue is that libpq clients should not include pg_wchar.h or otherwise assume they know the specific values of libpq's encoding IDs, since they might encounter version skew between pg_wchar.h and the libpq.so they are using. To fix, have libpq officially export functions needed for encoding name<=>ID conversion and validity checking; it was doing this anyway unofficially. It's still the case that we can't renumber backend encoding IDs until the next bump in libpq's major version number, since doing so will break the 8.2-era client programs. However the code is now prepared to avoid this type of problem in future. Note that initdb is no longer a libpq client: we just pull in the two source files we need directly. The patch also fixes a few places that were being sloppy about checking for an unrecognized encoding name.
2007-10-13 22:18:42 +02:00
char *encname = NameStr(*PG_GETARG_NAME(1));
int enc = pg_char_to_encoding(encname);
if (enc < 0)
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_OBJECT),
errmsg("%s is not a valid encoding name", encname)));
PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
}
/* ----------
* convert to ASCII - enc is set as int4
* ----------
*/
2001-03-22 05:01:46 +01:00
Datum
to_ascii_enc(PG_FUNCTION_ARGS)
{
2003-08-04 02:43:34 +02:00
text *data = PG_GETARG_TEXT_P_COPY(0);
int enc = PG_GETARG_INT32(1);
Fix the inadvertent libpq ABI breakage discovered by Martin Pitt: the renumbering of encoding IDs done between 8.2 and 8.3 turns out to break 8.2 initdb and psql if they are run with an 8.3beta1 libpq.so. For the moment we can rearrange the order of enum pg_enc to keep the same number for everything except PG_JOHAB, which isn't a problem since there are no direct references to it in the 8.2 programs anyway. (This does force initdb unfortunately.) Going forward, we want to fix things so that encoding IDs can be changed without an ABI break, and this commit includes the changes needed to allow libpq's encoding IDs to be treated as fully independent of the backend's. The main issue is that libpq clients should not include pg_wchar.h or otherwise assume they know the specific values of libpq's encoding IDs, since they might encounter version skew between pg_wchar.h and the libpq.so they are using. To fix, have libpq officially export functions needed for encoding name<=>ID conversion and validity checking; it was doing this anyway unofficially. It's still the case that we can't renumber backend encoding IDs until the next bump in libpq's major version number, since doing so will break the 8.2-era client programs. However the code is now prepared to avoid this type of problem in future. Note that initdb is no longer a libpq client: we just pull in the two source files we need directly. The patch also fixes a few places that were being sloppy about checking for an unrecognized encoding name.
2007-10-13 22:18:42 +02:00
if (!PG_VALID_ENCODING(enc))
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_OBJECT),
errmsg("%d is not a valid encoding code", enc)));
PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
}
/* ----------
* convert to ASCII - current enc is DatabaseEncoding
* ----------
*/
Datum
to_ascii_default(PG_FUNCTION_ARGS)
{
2003-08-04 02:43:34 +02:00
text *data = PG_GETARG_TEXT_P_COPY(0);
int enc = GetDatabaseEncoding();
PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
}
/* ----------
* Copy a string in an arbitrary backend-safe encoding, converting it to a
* valid ASCII string by replacing non-ASCII bytes with '?'. Otherwise the
* behavior is identical to strlcpy(), except that we don't bother with a
* return value.
*
* This must not trigger ereport(ERROR), as it is called in postmaster.
* ----------
*/
void
ascii_safe_strlcpy(char *dest, const char *src, size_t destsiz)
{
if (destsiz == 0) /* corner case: no room for trailing nul */
return;
while (--destsiz > 0)
{
/* use unsigned char here to avoid compiler warning */
unsigned char ch = *src++;
if (ch == '\0')
break;
/* Keep printable ASCII characters */
if (32 <= ch && ch <= 127)
*dest = ch;
/* White-space is also OK */
else if (ch == '\n' || ch == '\r' || ch == '\t')
*dest = ch;
/* Everything else is replaced with '?' */
else
*dest = '?';
dest++;
}
*dest = '\0';
}