2020-01-16 21:58:24 +01:00
|
|
|
/*-------------------------------------------------------------------------
 *
 * wchar.c
 *	  Functions for working with multibyte characters in various encodings.
 *
 * Portions Copyright (c) 1998-2023, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/wchar.c
 *
 *-------------------------------------------------------------------------
 */
|
2020-01-17 00:08:21 +01:00
|
|
|
#include "c.h"
|
Commit Karel's patch.
-------------------------------------------------------------------
Subject: Re: [PATCHES] encoding names
From: Karel Zak <zakkr@zf.jcu.cz>
To: Peter Eisentraut <peter_e@gmx.net>
Cc: pgsql-patches <pgsql-patches@postgresql.org>
Date: Fri, 31 Aug 2001 17:24:38 +0200
On Thu, Aug 30, 2001 at 01:30:40AM +0200, Peter Eisentraut wrote:
> > - convert encoding 'name' to 'id'
>
> I thought we decided not to add functions returning "new" names until we
> know exactly what the new names should be, and pending schema
Ok, the patch not to add functions.
> better
>
> ...(): encoding name too long
Fixed.
I found new bug in command/variable.c in parse_client_encoding(), nobody
probably never see this error:
if (pg_set_client_encoding(encoding))
{
elog(ERROR, "Conversion between %s and %s is not supported",
value, GetDatabaseEncodingName());
}
because pg_set_client_encoding() returns -1 for error and 0 as true.
It's fixed too.
IMHO it can be apply.
Karel
PS:
* following files are renamed:
src/utils/mb/Unicode/KOI8_to_utf8.map -->
src/utils/mb/Unicode/koi8r_to_utf8.map
src/utils/mb/Unicode/WIN_to_utf8.map -->
src/utils/mb/Unicode/win1251_to_utf8.map
src/utils/mb/Unicode/utf8_to_KOI8.map -->
src/utils/mb/Unicode/utf8_to_koi8r.map
src/utils/mb/Unicode/utf8_to_WIN.map -->
src/utils/mb/Unicode/utf8_to_win1251.map
* new file:
src/utils/mb/encname.c
* removed file:
src/utils/mb/common.c
--
Karel Zak <zakkr@zf.jcu.cz>
http://home.zf.jcu.cz/~zakkr/
C, PostgreSQL, PHP, WWW, http://docs.linux.cz, http://mape.jcu.cz
2001-09-06 06:57:30 +02:00
|
|
|
|
2001-09-21 17:27:38 +02:00
|
|
|
#include "mb/pg_wchar.h"
|
|
|
|
|
Commit Karel's patch.
-------------------------------------------------------------------
Subject: Re: [PATCHES] encoding names
From: Karel Zak <zakkr@zf.jcu.cz>
To: Peter Eisentraut <peter_e@gmx.net>
Cc: pgsql-patches <pgsql-patches@postgresql.org>
Date: Fri, 31 Aug 2001 17:24:38 +0200
On Thu, Aug 30, 2001 at 01:30:40AM +0200, Peter Eisentraut wrote:
> > - convert encoding 'name' to 'id'
>
> I thought we decided not to add functions returning "new" names until we
> know exactly what the new names should be, and pending schema
Ok, the patch not to add functions.
> better
>
> ...(): encoding name too long
Fixed.
I found new bug in command/variable.c in parse_client_encoding(), nobody
probably never see this error:
if (pg_set_client_encoding(encoding))
{
elog(ERROR, "Conversion between %s and %s is not supported",
value, GetDatabaseEncodingName());
}
because pg_set_client_encoding() returns -1 for error and 0 as true.
It's fixed too.
IMHO it can be apply.
Karel
PS:
* following files are renamed:
src/utils/mb/Unicode/KOI8_to_utf8.map -->
src/utils/mb/Unicode/koi8r_to_utf8.map
src/utils/mb/Unicode/WIN_to_utf8.map -->
src/utils/mb/Unicode/win1251_to_utf8.map
src/utils/mb/Unicode/utf8_to_KOI8.map -->
src/utils/mb/Unicode/utf8_to_koi8r.map
src/utils/mb/Unicode/utf8_to_WIN.map -->
src/utils/mb/Unicode/utf8_to_win1251.map
* new file:
src/utils/mb/encname.c
* removed file:
src/utils/mb/common.c
--
Karel Zak <zakkr@zf.jcu.cz>
http://home.zf.jcu.cz/~zakkr/
C, PostgreSQL, PHP, WWW, http://docs.linux.cz, http://mape.jcu.cz
2001-09-06 06:57:30 +02:00
|
|
|
|
1998-03-15 08:39:04 +01:00
|
|
|
/*
 * Operations on multi-byte encodings are driven by a table of helper
 * functions.
 *
 * To add support for an encoding, define mblen(), dsplen(), verifychar() and
 * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
 * and wchar2mb() conversion functions.
 *
 * These functions generally assume that their input is validly formed.
 * The "verifier" functions, further down in the file, have to be more
 * paranoid.
 *
 * We expect that mblen() does not need to examine more than the first byte
 * of the character to discover the correct length.  GB18030 is an exception
 * to that rule, though, as it also looks at the second byte.  But even that
 * behaves in a predictable way, if you only pass the first byte: it will
 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
 * good enough for all current uses.
 *
 * Note: for the display output of psql to work properly, the return values
 * of the dsplen functions must conform to the Unicode standard.  In particular
 * the NUL character is zero width and control characters are generally
 * width -1.  It is recommended that non-ASCII encodings refer their ASCII
 * subset to the ASCII routines to ensure consistency.
 */
|
1998-08-25 06:19:16 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* SQL/ASCII
|
|
|
|
*/
|
2005-12-24 17:49:48 +01:00
|
|
|
static int
|
2007-10-16 00:46:27 +02:00
|
|
|
pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
1998-08-25 06:19:16 +02:00
|
|
|
{
|
2000-08-27 12:40:48 +02:00
|
|
|
int cnt = 0;
|
|
|
|
|
2001-03-08 01:24:34 +01:00
|
|
|
while (len > 0 && *from)
|
1998-08-25 06:19:16 +02:00
|
|
|
{
|
|
|
|
*to++ = *from++;
|
|
|
|
len--;
|
2000-08-27 12:40:48 +02:00
|
|
|
cnt++;
|
1998-08-25 06:19:16 +02:00
|
|
|
}
|
|
|
|
*to = 0;
|
2005-12-24 17:49:48 +01:00
|
|
|
return cnt;
|
1998-08-25 06:19:16 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Byte length of the character starting at "s": always 1 for a single-byte
 * encoding.  The byte itself is never examined.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	return 1;
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Display width of the single-byte character at "s".
 *
 * Returns 0 for NUL, -1 for control characters (bytes below 0x20 and 0x7f),
 * and 1 for every other byte value.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	if (*s == '\0')
		return 0;
	if (*s < 0x20 || *s == 0x7f)
		return -1;

	return 1;
}
|
|
|
|
|
1998-08-25 06:19:16 +02:00
|
|
|
/*
|
|
|
|
* EUC
|
|
|
|
*/
|
2007-10-16 00:46:27 +02:00
|
|
|
static int
|
|
|
|
pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
2000-08-27 12:40:48 +02:00
|
|
|
int cnt = 0;
|
|
|
|
|
2001-03-08 01:24:34 +01:00
|
|
|
while (len > 0 && *from)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
2005-12-24 10:35:36 +01:00
|
|
|
if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
|
|
|
|
* KANA") */
|
1998-09-01 06:40:42 +02:00
|
|
|
{
|
1998-03-15 08:39:04 +01:00
|
|
|
from++;
|
2005-12-24 10:35:36 +01:00
|
|
|
*to = (SS2 << 8) | *from++;
|
2001-03-08 01:24:34 +01:00
|
|
|
len -= 2;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
2005-12-24 10:35:36 +01:00
|
|
|
else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
|
|
|
from++;
|
2005-12-24 10:35:36 +01:00
|
|
|
*to = (SS3 << 16) | (*from++ << 8);
|
|
|
|
*to |= *from++;
|
1998-06-16 09:29:54 +02:00
|
|
|
len -= 3;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
2005-12-25 03:14:19 +01:00
|
|
|
else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
|
|
|
*to = *from++ << 8;
|
|
|
|
*to |= *from++;
|
1998-06-16 09:29:54 +02:00
|
|
|
len -= 2;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
2012-07-10 21:58:36 +02:00
|
|
|
else /* must be ASCII */
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
|
|
|
*to = *from++;
|
1998-06-16 09:29:54 +02:00
|
|
|
len--;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
to++;
|
2000-08-27 12:40:48 +02:00
|
|
|
cnt++;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
*to = 0;
|
2005-12-24 17:49:48 +01:00
|
|
|
return cnt;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
static inline int
|
1998-06-16 09:29:54 +02:00
|
|
|
pg_euc_mblen(const unsigned char *s)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
1998-06-16 09:29:54 +02:00
|
|
|
int len;
|
|
|
|
|
|
|
|
if (*s == SS2)
|
|
|
|
len = 2;
|
|
|
|
else if (*s == SS3)
|
|
|
|
len = 3;
|
2005-12-25 03:14:19 +01:00
|
|
|
else if (IS_HIGHBIT_SET(*s))
|
1998-06-16 09:29:54 +02:00
|
|
|
len = 2;
|
|
|
|
else
|
|
|
|
len = 1;
|
2005-12-24 17:49:48 +01:00
|
|
|
return len;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
static inline int
|
2004-03-15 11:41:26 +01:00
|
|
|
pg_euc_dsplen(const unsigned char *s)
|
|
|
|
{
|
|
|
|
int len;
|
|
|
|
|
|
|
|
if (*s == SS2)
|
|
|
|
len = 2;
|
|
|
|
else if (*s == SS3)
|
|
|
|
len = 2;
|
2005-12-25 03:14:19 +01:00
|
|
|
else if (IS_HIGHBIT_SET(*s))
|
2004-03-15 11:41:26 +01:00
|
|
|
len = 2;
|
|
|
|
else
|
2006-02-10 01:39:04 +01:00
|
|
|
len = pg_ascii_dsplen(s);
|
2005-12-24 17:49:48 +01:00
|
|
|
return len;
|
2004-03-15 11:41:26 +01:00
|
|
|
}
|
|
|
|
|
1998-03-15 08:39:04 +01:00
|
|
|
/*
|
1998-06-16 09:29:54 +02:00
|
|
|
* EUC_JP
|
1998-03-15 08:39:04 +01:00
|
|
|
*/
|
2007-10-16 00:46:27 +02:00
|
|
|
static int
|
|
|
|
pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
2005-12-24 17:49:48 +01:00
|
|
|
return pg_euc2wchar_with_len(from, to, len);
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
|
1998-06-16 09:29:54 +02:00
|
|
|
/*
 * Byte length of the EUC_JP character starting at "s"; same rules as the
 * generic pg_euc_mblen().
 */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
static int
|
|
|
|
pg_eucjp_dsplen(const unsigned char *s)
|
|
|
|
{
|
|
|
|
int len;
|
|
|
|
|
|
|
|
if (*s == SS2)
|
|
|
|
len = 1;
|
|
|
|
else if (*s == SS3)
|
|
|
|
len = 2;
|
2005-12-25 03:14:19 +01:00
|
|
|
else if (IS_HIGHBIT_SET(*s))
|
2004-03-15 11:41:26 +01:00
|
|
|
len = 2;
|
|
|
|
else
|
2006-02-10 01:39:04 +01:00
|
|
|
len = pg_ascii_dsplen(s);
|
2005-12-24 17:49:48 +01:00
|
|
|
return len;
|
2004-03-15 11:41:26 +01:00
|
|
|
}
|
|
|
|
|
1998-03-15 08:39:04 +01:00
|
|
|
/*
|
1998-06-16 09:29:54 +02:00
|
|
|
* EUC_KR
|
1998-03-15 08:39:04 +01:00
|
|
|
*/
|
2007-10-16 00:46:27 +02:00
|
|
|
static int
|
|
|
|
pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
2005-12-24 17:49:48 +01:00
|
|
|
return pg_euc2wchar_with_len(from, to, len);
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
|
1998-06-16 09:29:54 +02:00
|
|
|
/*
 * Byte length of the EUC_KR character starting at "s"; same rules as the
 * generic pg_euc_mblen().
 */
static int
pg_euckr_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Display width of the EUC_KR character starting at "s"; same rules as the
 * generic pg_euc_dsplen().
 */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	return pg_euc_dsplen(s);
}
|
|
|
|
|
1998-06-16 09:29:54 +02:00
|
|
|
/*
|
|
|
|
* EUC_CN
|
2005-12-24 10:35:36 +01:00
|
|
|
*
|
1998-06-16 09:29:54 +02:00
|
|
|
*/
|
2007-10-16 00:46:27 +02:00
|
|
|
static int
|
|
|
|
pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
2000-08-27 12:40:48 +02:00
|
|
|
int cnt = 0;
|
|
|
|
|
2001-03-08 01:24:34 +01:00
|
|
|
while (len > 0 && *from)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
2005-12-24 10:35:36 +01:00
|
|
|
if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
|
1998-09-01 06:40:42 +02:00
|
|
|
{
|
1998-03-15 08:39:04 +01:00
|
|
|
from++;
|
2005-12-24 10:35:36 +01:00
|
|
|
*to = (SS2 << 16) | (*from++ << 8);
|
|
|
|
*to |= *from++;
|
2001-03-08 01:24:34 +01:00
|
|
|
len -= 3;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
2012-04-24 04:43:09 +02:00
|
|
|
else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
|
|
|
from++;
|
2005-12-24 10:35:36 +01:00
|
|
|
*to = (SS3 << 16) | (*from++ << 8);
|
|
|
|
*to |= *from++;
|
1998-03-15 08:39:04 +01:00
|
|
|
len -= 3;
|
|
|
|
}
|
2005-12-25 03:14:19 +01:00
|
|
|
else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
|
|
|
*to = *from++ << 8;
|
|
|
|
*to |= *from++;
|
|
|
|
len -= 2;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
*to = *from++;
|
|
|
|
len--;
|
|
|
|
}
|
|
|
|
to++;
|
2000-08-27 12:40:48 +02:00
|
|
|
cnt++;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
*to = 0;
|
2005-12-24 17:49:48 +01:00
|
|
|
return cnt;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
|
1998-06-16 09:29:54 +02:00
|
|
|
/*
 * Byte length of the EUC_CN character starting at "s": 2 if the first byte
 * has its high bit set, otherwise 1.
 *
 * NOTE(review): pg_euccn2wchar_with_len() consumes three bytes for SS2- and
 * SS3-led sequences, whereas this function does not special-case those lead
 * bytes.  Confirm that the difference is intended before relying on the two
 * functions agreeing.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;
	else
		len = 1;
	return len;
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Display width of the EUC_CN character starting at "s": two columns if the
 * first byte has its high bit set, otherwise whatever pg_ascii_dsplen()
 * reports.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;
	else
		len = pg_ascii_dsplen(s);
	return len;
}
|
|
|
|
|
1998-06-16 09:29:54 +02:00
|
|
|
/*
|
|
|
|
* EUC_TW
|
2005-12-24 10:35:36 +01:00
|
|
|
*
|
1998-06-16 09:29:54 +02:00
|
|
|
*/
|
2007-10-16 00:46:27 +02:00
|
|
|
static int
|
|
|
|
pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
2000-08-27 12:40:48 +02:00
|
|
|
int cnt = 0;
|
|
|
|
|
2001-03-08 01:24:34 +01:00
|
|
|
while (len > 0 && *from)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
2005-12-24 10:35:36 +01:00
|
|
|
if (*from == SS2 && len >= 4) /* code set 2 */
|
1998-09-01 06:40:42 +02:00
|
|
|
{
|
1998-03-15 08:39:04 +01:00
|
|
|
from++;
|
2007-07-12 23:17:09 +02:00
|
|
|
*to = (((uint32) SS2) << 24) | (*from++ << 16);
|
1998-03-15 08:39:04 +01:00
|
|
|
*to |= *from++ << 8;
|
|
|
|
*to |= *from++;
|
2001-03-08 01:24:34 +01:00
|
|
|
len -= 4;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
2005-12-24 10:35:36 +01:00
|
|
|
else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
|
|
|
from++;
|
2005-12-24 10:35:36 +01:00
|
|
|
*to = (SS3 << 16) | (*from++ << 8);
|
|
|
|
*to |= *from++;
|
1998-03-15 08:39:04 +01:00
|
|
|
len -= 3;
|
|
|
|
}
|
2005-12-25 03:14:19 +01:00
|
|
|
else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
|
|
|
*to = *from++ << 8;
|
|
|
|
*to |= *from++;
|
|
|
|
len -= 2;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
*to = *from++;
|
|
|
|
len--;
|
|
|
|
}
|
|
|
|
to++;
|
2000-08-27 12:40:48 +02:00
|
|
|
cnt++;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
*to = 0;
|
2005-12-24 17:49:48 +01:00
|
|
|
return cnt;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
|
1998-06-16 09:29:54 +02:00
|
|
|
static int
|
|
|
|
pg_euctw_mblen(const unsigned char *s)
|
|
|
|
{
|
|
|
|
int len;
|
|
|
|
|
|
|
|
if (*s == SS2)
|
|
|
|
len = 4;
|
|
|
|
else if (*s == SS3)
|
|
|
|
len = 3;
|
2005-12-25 03:14:19 +01:00
|
|
|
else if (IS_HIGHBIT_SET(*s))
|
1998-06-16 09:29:54 +02:00
|
|
|
len = 2;
|
|
|
|
else
|
2006-05-21 22:05:21 +02:00
|
|
|
len = 1;
|
2005-12-24 17:49:48 +01:00
|
|
|
return len;
|
1998-06-16 09:29:54 +02:00
|
|
|
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
static int
|
|
|
|
pg_euctw_dsplen(const unsigned char *s)
|
|
|
|
{
|
|
|
|
int len;
|
|
|
|
|
|
|
|
if (*s == SS2)
|
|
|
|
len = 2;
|
|
|
|
else if (*s == SS3)
|
|
|
|
len = 2;
|
2005-12-25 03:14:19 +01:00
|
|
|
else if (IS_HIGHBIT_SET(*s))
|
2004-03-15 11:41:26 +01:00
|
|
|
len = 2;
|
|
|
|
else
|
2006-02-10 01:39:04 +01:00
|
|
|
len = pg_ascii_dsplen(s);
|
2005-12-24 17:49:48 +01:00
|
|
|
return len;
|
2004-03-15 11:41:26 +01:00
|
|
|
}
|
|
|
|
|
2012-07-04 23:10:10 +02:00
|
|
|
/*
|
2012-07-10 21:58:36 +02:00
|
|
|
* Convert pg_wchar to EUC_* encoding.
|
2012-07-04 23:10:10 +02:00
|
|
|
* caller must allocate enough space for "to", including a trailing zero!
|
|
|
|
* len: length of from.
|
|
|
|
* "from" not necessarily null terminated.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|
|
|
{
|
|
|
|
int cnt = 0;
|
|
|
|
|
|
|
|
while (len > 0 && *from)
|
|
|
|
{
|
|
|
|
unsigned char c;
|
2012-07-10 21:58:36 +02:00
|
|
|
|
|
|
|
if ((c = (*from >> 24)))
|
2012-07-04 23:10:10 +02:00
|
|
|
{
|
|
|
|
*to++ = c;
|
|
|
|
*to++ = (*from >> 16) & 0xff;
|
|
|
|
*to++ = (*from >> 8) & 0xff;
|
|
|
|
*to++ = *from & 0xff;
|
|
|
|
cnt += 4;
|
|
|
|
}
|
2012-07-10 21:58:36 +02:00
|
|
|
else if ((c = (*from >> 16)))
|
2012-07-04 23:10:10 +02:00
|
|
|
{
|
|
|
|
*to++ = c;
|
|
|
|
*to++ = (*from >> 8) & 0xff;
|
|
|
|
*to++ = *from & 0xff;
|
|
|
|
cnt += 3;
|
|
|
|
}
|
2012-07-10 21:58:36 +02:00
|
|
|
else if ((c = (*from >> 8)))
|
2012-07-04 23:10:10 +02:00
|
|
|
{
|
|
|
|
*to++ = c;
|
|
|
|
*to++ = *from & 0xff;
|
|
|
|
cnt += 2;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
*to++ = *from;
|
|
|
|
cnt++;
|
2012-07-10 21:58:36 +02:00
|
|
|
}
|
2012-07-06 05:47:53 +02:00
|
|
|
from++;
|
2012-07-04 23:10:10 +02:00
|
|
|
len--;
|
|
|
|
}
|
|
|
|
*to = 0;
|
|
|
|
return cnt;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2002-03-05 06:52:50 +01:00
|
|
|
/*
|
|
|
|
* JOHAB
|
|
|
|
*/
|
|
|
|
/*
 * Byte length of the JOHAB character starting at "s"; uses the same rules
 * as the generic pg_euc_mblen().
 */
static int
pg_johab_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Display width of the JOHAB character starting at "s"; uses the same rules
 * as the generic pg_euc_dsplen().
 */
static int
pg_johab_dsplen(const unsigned char *s)
{
	return pg_euc_dsplen(s);
}
|
|
|
|
|
1998-03-15 08:39:04 +01:00
|
|
|
/*
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
* convert UTF8 string to pg_wchar (UCS-4)
|
|
|
|
* caller must allocate enough space for "to", including a trailing zero!
|
1998-03-15 08:39:04 +01:00
|
|
|
* len: length of from.
|
|
|
|
* "from" not necessarily null terminated.
|
|
|
|
*/
|
2000-08-27 12:40:48 +02:00
|
|
|
static int
|
1998-03-15 08:39:04 +01:00
|
|
|
pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
|
|
{
|
2000-08-27 12:40:48 +02:00
|
|
|
int cnt = 0;
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
uint32 c1,
|
|
|
|
c2,
|
|
|
|
c3,
|
|
|
|
c4;
|
1998-09-01 06:40:42 +02:00
|
|
|
|
2001-03-08 01:24:34 +01:00
|
|
|
while (len > 0 && *from)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
if ((*from & 0x80) == 0)
|
1998-09-01 06:40:42 +02:00
|
|
|
{
|
1998-03-15 08:39:04 +01:00
|
|
|
*to = *from++;
|
|
|
|
len--;
|
|
|
|
}
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
else if ((*from & 0xe0) == 0xc0)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
if (len < 2)
|
|
|
|
break; /* drop trailing incomplete char */
|
1998-03-15 08:39:04 +01:00
|
|
|
c1 = *from++ & 0x1f;
|
|
|
|
c2 = *from++ & 0x3f;
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
*to = (c1 << 6) | c2;
|
2001-03-08 01:24:34 +01:00
|
|
|
len -= 2;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
else if ((*from & 0xf0) == 0xe0)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
if (len < 3)
|
|
|
|
break; /* drop trailing incomplete char */
|
1998-03-15 08:39:04 +01:00
|
|
|
c1 = *from++ & 0x0f;
|
|
|
|
c2 = *from++ & 0x3f;
|
|
|
|
c3 = *from++ & 0x3f;
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
*to = (c1 << 12) | (c2 << 6) | c3;
|
2001-03-08 01:24:34 +01:00
|
|
|
len -= 3;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
else if ((*from & 0xf8) == 0xf0)
|
|
|
|
{
|
|
|
|
if (len < 4)
|
|
|
|
break; /* drop trailing incomplete char */
|
|
|
|
c1 = *from++ & 0x07;
|
|
|
|
c2 = *from++ & 0x3f;
|
|
|
|
c3 = *from++ & 0x3f;
|
|
|
|
c4 = *from++ & 0x3f;
|
|
|
|
*to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
|
|
|
|
len -= 4;
|
|
|
|
}
|
1999-04-25 22:35:51 +02:00
|
|
|
else
|
|
|
|
{
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
/* treat a bogus char as length 1; not ours to raise error */
|
1999-04-25 22:35:51 +02:00
|
|
|
*to = *from++;
|
|
|
|
len--;
|
|
|
|
}
|
1998-03-15 08:39:04 +01:00
|
|
|
to++;
|
2000-08-27 12:40:48 +02:00
|
|
|
cnt++;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
*to = 0;
|
2005-12-24 17:49:48 +01:00
|
|
|
return cnt;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
|
2008-10-29 09:04:54 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
|
|
|
|
* space allocated.
|
|
|
|
*/
|
|
|
|
unsigned char *
|
|
|
|
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
|
|
|
|
{
|
|
|
|
if (c <= 0x7F)
|
|
|
|
{
|
|
|
|
utf8string[0] = c;
|
|
|
|
}
|
|
|
|
else if (c <= 0x7FF)
|
|
|
|
{
|
|
|
|
utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
|
|
|
|
utf8string[1] = 0x80 | (c & 0x3F);
|
|
|
|
}
|
|
|
|
else if (c <= 0xFFFF)
|
|
|
|
{
|
|
|
|
utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
|
|
|
|
utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
|
|
|
|
utf8string[2] = 0x80 | (c & 0x3F);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
|
|
|
|
utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
|
|
|
|
utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
|
|
|
|
utf8string[3] = 0x80 | (c & 0x3F);
|
|
|
|
}
|
|
|
|
|
|
|
|
return utf8string;
|
|
|
|
}
|
|
|
|
|
2012-07-04 23:10:10 +02:00
|
|
|
/*
|
|
|
|
* Trivial conversion from pg_wchar to UTF-8.
|
|
|
|
* caller should allocate enough space for "to"
|
|
|
|
* len: length of from.
|
|
|
|
* "from" not necessarily null terminated.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|
|
|
{
|
|
|
|
int cnt = 0;
|
|
|
|
|
|
|
|
while (len > 0 && *from)
|
|
|
|
{
|
|
|
|
int char_len;
|
2012-07-10 21:58:36 +02:00
|
|
|
|
2012-07-04 23:10:10 +02:00
|
|
|
unicode_to_utf8(*from, to);
|
|
|
|
char_len = pg_utf_mblen(to);
|
|
|
|
cnt += char_len;
|
|
|
|
to += char_len;
|
2012-07-06 05:47:53 +02:00
|
|
|
from++;
|
|
|
|
len--;
|
2012-07-04 23:10:10 +02:00
|
|
|
}
|
|
|
|
*to = 0;
|
|
|
|
return cnt;
|
|
|
|
}
|
2008-10-29 09:04:54 +01:00
|
|
|
|
2000-10-12 08:06:50 +02:00
|
|
|
/*
 * Return the byte length of a UTF8 character pointed to by s
 *
 * Only the first byte of s is examined.
 *
 * Note: in the current implementation we do not support UTF8 sequences
 * of more than 4 bytes; hence do NOT return a value larger than 4.
 * We return "1" for any leading byte that is either flat-out illegal or
 * indicates a length larger than we support.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	int			len;

	if ((*s & 0x80) == 0)
		len = 1;				/* 0xxxxxxx */
	else if ((*s & 0xe0) == 0xc0)
		len = 2;				/* 110xxxxx */
	else if ((*s & 0xf0) == 0xe0)
		len = 3;				/* 1110xxxx */
	else if ((*s & 0xf8) == 0xf0)
		len = 4;				/* 11110xxx */
#ifdef NOT_USED
	else if ((*s & 0xfc) == 0xf8)
		len = 5;
	else if ((*s & 0xfe) == 0xfc)
		len = 6;
#endif
	else
		len = 1;				/* continuation byte or unsupported lead */
	return len;
}
|
|
|
|
|
2006-02-10 01:39:04 +01:00
|
|
|
/*
 * This is an implementation of wcwidth() and wcswidth() as defined in
 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
 * <http://www.unix.org/online.html>
 *
 * Markus Kuhn -- 2001-09-08 -- public domain
 *
 * customised for PostgreSQL
 *
 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 */

/* One closed range [first, last] of code points in an interval table. */
struct mbinterval
{
	unsigned int first;
	unsigned int last;
};
|
|
|
|
|
|
|
|
/* auxiliary function for binary search in interval table */
|
2021-08-26 15:58:28 +02:00
|
|
|
static int
|
2006-02-10 01:39:04 +01:00
|
|
|
mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
|
|
|
|
{
|
|
|
|
int min = 0;
|
|
|
|
int mid;
|
|
|
|
|
|
|
|
if (ucs < table[0].first || ucs > table[max].last)
|
2021-08-26 15:58:28 +02:00
|
|
|
return 0;
|
2006-02-10 01:39:04 +01:00
|
|
|
while (max >= min)
|
|
|
|
{
|
|
|
|
mid = (min + max) / 2;
|
|
|
|
if (ucs > table[mid].last)
|
|
|
|
min = mid + 1;
|
|
|
|
else if (ucs < table[mid].first)
|
|
|
|
max = mid - 1;
|
|
|
|
else
|
2021-08-26 15:58:28 +02:00
|
|
|
return 1;
|
2006-02-10 01:39:04 +01:00
|
|
|
}
|
|
|
|
|
2021-08-26 15:58:28 +02:00
|
|
|
return 0;
|
2006-02-10 01:39:04 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* The following functions define the column width of an ISO 10646
|
|
|
|
* character as follows:
|
|
|
|
*
|
|
|
|
* - The null character (U+0000) has a column width of 0.
|
|
|
|
*
|
|
|
|
* - Other C0/C1 control characters and DEL will lead to a return
|
|
|
|
* value of -1.
|
|
|
|
*
|
|
|
|
* - Non-spacing and enclosing combining characters (general
|
2022-09-13 11:13:33 +02:00
|
|
|
* category code Mn, Me or Cf in the Unicode database) have a
|
2006-02-10 01:39:04 +01:00
|
|
|
* column width of 0.
|
|
|
|
*
|
|
|
|
* - Spacing characters in the East Asian Wide (W) or East Asian
|
|
|
|
* FullWidth (F) category as defined in Unicode Technical
|
|
|
|
* Report #11 have a column width of 2.
|
|
|
|
*
|
|
|
|
* - All remaining characters (including all printable
|
|
|
|
* ISO 8859-1 and WGL4 characters, Unicode control characters,
|
|
|
|
* etc.) have a column width of 1.
|
|
|
|
*
|
|
|
|
* This implementation assumes that wchar_t characters are encoded
|
|
|
|
* in ISO 10646.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int
|
|
|
|
ucs_wcwidth(pg_wchar ucs)
|
|
|
|
{
|
2022-09-13 11:13:33 +02:00
|
|
|
#include "common/unicode_nonspacing_table.h"
|
2021-08-26 16:53:56 +02:00
|
|
|
#include "common/unicode_east_asian_fw_table.h"
|
2006-02-10 01:39:04 +01:00
|
|
|
|
|
|
|
/* test for 8-bit control characters */
|
|
|
|
if (ucs == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
|
|
|
|
return -1;
|
|
|
|
|
2021-08-26 16:53:56 +02:00
|
|
|
/*
|
|
|
|
* binary search in table of non-spacing characters
|
|
|
|
*
|
|
|
|
* XXX: In the official Unicode sources, it is possible for a character to
|
|
|
|
* be described as both non-spacing and wide at the same time. As of
|
|
|
|
* Unicode 13.0, treating the non-spacing property as the determining
|
|
|
|
* factor for display width leads to the correct behavior, so do that
|
|
|
|
* search first.
|
|
|
|
*/
|
2022-09-13 11:13:33 +02:00
|
|
|
if (mbbisearch(ucs, nonspacing,
|
|
|
|
sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
|
2021-08-26 15:58:28 +02:00
|
|
|
return 0;
|
2006-02-10 01:39:04 +01:00
|
|
|
|
2021-08-26 16:53:56 +02:00
|
|
|
/* binary search in table of wide characters */
|
|
|
|
if (mbbisearch(ucs, east_asian_fw,
|
|
|
|
sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
|
|
|
|
return 2;
|
2006-02-10 01:39:04 +01:00
|
|
|
|
2021-08-26 16:53:56 +02:00
|
|
|
return 1;
|
2006-02-10 01:39:04 +01:00
|
|
|
}
|
|
|
|
|
2010-08-18 21:54:01 +02:00
|
|
|
/*
|
|
|
|
* Convert a UTF-8 character to a Unicode code point.
|
|
|
|
* This is a one-character version of pg_utf2wchar_with_len.
|
|
|
|
*
|
|
|
|
* No error checks here, c must point to a long-enough string.
|
|
|
|
*/
|
|
|
|
pg_wchar
|
|
|
|
utf8_to_unicode(const unsigned char *c)
|
2006-02-10 01:39:04 +01:00
|
|
|
{
|
|
|
|
if ((*c & 0x80) == 0)
|
|
|
|
return (pg_wchar) c[0];
|
|
|
|
else if ((*c & 0xe0) == 0xc0)
|
|
|
|
return (pg_wchar) (((c[0] & 0x1f) << 6) |
|
|
|
|
(c[1] & 0x3f));
|
|
|
|
else if ((*c & 0xf0) == 0xe0)
|
|
|
|
return (pg_wchar) (((c[0] & 0x0f) << 12) |
|
|
|
|
((c[1] & 0x3f) << 6) |
|
|
|
|
(c[2] & 0x3f));
|
Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same
page about the maximum UTF8 sequence length we support (4 bytes since 8.1,
3 before that). pg_utf2wchar_with_len never got updated to support 4-byte
characters at all, and in any case had a buffer-overrun risk in that it
could produce multiple pg_wchars from what mblen claims to be just one UTF8
character. The only reason we don't have a major security hole is that most
callers allocate worst-case output buffers; the sole exception in released
versions appears to be pre-8.2 iwchareq() (ie, ILIKE), which can be crashed
due to zeroing out its return address --- but AFAICS that can't be exploited
for anything more than a crash, due to inability to control what gets written
there. Per report from James Russell and Michael Fuhr.
Pre-8.1 the risk is much less, but I still think pg_utf2wchar_with_len's
behavior given an incomplete final character risks buffer overrun, so
back-patch that logic change anyway.
This patch also makes sure that UTF8 sequences exceeding the supported
length (whichever it is) are consistently treated as error cases, rather
than being treated like a valid shorter sequence in some places.
2007-01-24 18:12:17 +01:00
|
|
|
else if ((*c & 0xf8) == 0xf0)
|
2006-02-10 01:39:04 +01:00
|
|
|
return (pg_wchar) (((c[0] & 0x07) << 18) |
|
|
|
|
((c[1] & 0x3f) << 12) |
|
|
|
|
((c[2] & 0x3f) << 6) |
|
|
|
|
(c[3] & 0x3f));
|
|
|
|
else
|
|
|
|
/* that is an invalid code on purpose */
|
|
|
|
return 0xffffffff;
|
|
|
|
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Display width of the UTF-8 character at s: decode it with
 * utf8_to_unicode() and return whatever ucs_wcwidth() reports for the
 * resulting code point.
 */
static int
pg_utf_dsplen(const unsigned char *s)
{
	return ucs_wcwidth(utf8_to_unicode(s));
}
|
|
|
|
|
1998-03-15 08:39:04 +01:00
|
|
|
/*
|
|
|
|
* convert mule internal code to pg_wchar
|
|
|
|
* caller should allocate enough space for "to"
|
|
|
|
* len: length of from.
|
|
|
|
* "from" not necessarily null terminated.
|
|
|
|
*/
|
2000-08-27 12:40:48 +02:00
|
|
|
static int
|
1998-03-15 08:39:04 +01:00
|
|
|
pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
|
|
{
|
2000-08-27 12:40:48 +02:00
|
|
|
int cnt = 0;
|
|
|
|
|
2001-03-08 01:24:34 +01:00
|
|
|
while (len > 0 && *from)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
2001-03-08 01:24:34 +01:00
|
|
|
if (IS_LC1(*from) && len >= 2)
|
1998-09-01 06:40:42 +02:00
|
|
|
{
|
1998-03-15 08:39:04 +01:00
|
|
|
*to = *from++ << 16;
|
|
|
|
*to |= *from++;
|
|
|
|
len -= 2;
|
|
|
|
}
|
2001-03-08 01:24:34 +01:00
|
|
|
else if (IS_LCPRV1(*from) && len >= 3)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
|
|
|
from++;
|
|
|
|
*to = *from++ << 16;
|
|
|
|
*to |= *from++;
|
|
|
|
len -= 3;
|
|
|
|
}
|
2001-03-08 01:24:34 +01:00
|
|
|
else if (IS_LC2(*from) && len >= 3)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
|
|
|
*to = *from++ << 16;
|
|
|
|
*to |= *from++ << 8;
|
|
|
|
*to |= *from++;
|
|
|
|
len -= 3;
|
|
|
|
}
|
2001-03-08 01:24:34 +01:00
|
|
|
else if (IS_LCPRV2(*from) && len >= 4)
|
1998-03-15 08:39:04 +01:00
|
|
|
{
|
|
|
|
from++;
|
|
|
|
*to = *from++ << 16;
|
|
|
|
*to |= *from++ << 8;
|
|
|
|
*to |= *from++;
|
|
|
|
len -= 4;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{ /* assume ASCII */
|
|
|
|
*to = (unsigned char) *from++;
|
|
|
|
len--;
|
|
|
|
}
|
|
|
|
to++;
|
2000-08-27 12:40:48 +02:00
|
|
|
cnt++;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
*to = 0;
|
2005-12-24 17:49:48 +01:00
|
|
|
return cnt;
|
1998-03-15 08:39:04 +01:00
|
|
|
}
|
|
|
|
|
2012-07-04 23:10:10 +02:00
|
|
|
/*
|
|
|
|
* convert pg_wchar to mule internal code
|
|
|
|
* caller should allocate enough space for "to"
|
|
|
|
* len: length of from.
|
|
|
|
* "from" not necessarily null terminated.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|
|
|
{
|
|
|
|
int cnt = 0;
|
|
|
|
|
|
|
|
while (len > 0 && *from)
|
|
|
|
{
|
2012-07-10 21:58:36 +02:00
|
|
|
unsigned char lb;
|
|
|
|
|
2012-07-04 23:10:10 +02:00
|
|
|
lb = (*from >> 16) & 0xff;
|
|
|
|
if (IS_LC1(lb))
|
|
|
|
{
|
|
|
|
*to++ = lb;
|
|
|
|
*to++ = *from & 0xff;
|
|
|
|
cnt += 2;
|
|
|
|
}
|
|
|
|
else if (IS_LC2(lb))
|
|
|
|
{
|
|
|
|
*to++ = lb;
|
|
|
|
*to++ = (*from >> 8) & 0xff;
|
|
|
|
*to++ = *from & 0xff;
|
|
|
|
cnt += 3;
|
|
|
|
}
|
|
|
|
else if (IS_LCPRV1_A_RANGE(lb))
|
|
|
|
{
|
|
|
|
*to++ = LCPRV1_A;
|
|
|
|
*to++ = lb;
|
|
|
|
*to++ = *from & 0xff;
|
|
|
|
cnt += 3;
|
|
|
|
}
|
|
|
|
else if (IS_LCPRV1_B_RANGE(lb))
|
|
|
|
{
|
|
|
|
*to++ = LCPRV1_B;
|
|
|
|
*to++ = lb;
|
|
|
|
*to++ = *from & 0xff;
|
|
|
|
cnt += 3;
|
|
|
|
}
|
|
|
|
else if (IS_LCPRV2_A_RANGE(lb))
|
|
|
|
{
|
|
|
|
*to++ = LCPRV2_A;
|
|
|
|
*to++ = lb;
|
|
|
|
*to++ = (*from >> 8) & 0xff;
|
|
|
|
*to++ = *from & 0xff;
|
|
|
|
cnt += 4;
|
|
|
|
}
|
|
|
|
else if (IS_LCPRV2_B_RANGE(lb))
|
|
|
|
{
|
|
|
|
*to++ = LCPRV2_B;
|
|
|
|
*to++ = lb;
|
|
|
|
*to++ = (*from >> 8) & 0xff;
|
|
|
|
*to++ = *from & 0xff;
|
|
|
|
cnt += 4;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2012-07-10 21:58:36 +02:00
|
|
|
*to++ = *from & 0xff;
|
2012-07-04 23:10:10 +02:00
|
|
|
cnt += 1;
|
|
|
|
}
|
2012-07-06 05:47:53 +02:00
|
|
|
from++;
|
2012-07-04 23:10:10 +02:00
|
|
|
len--;
|
|
|
|
}
|
|
|
|
*to = 0;
|
|
|
|
return cnt;
|
|
|
|
}
|
|
|
|
|
2020-01-17 00:08:21 +01:00
|
|
|
/*
 * exported for direct use by conv.c
 *
 * Return the byte length of the mule-internal-code character whose first
 * byte is *s.  Only that first byte is examined.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	int			len;

	if (IS_LC1(*s))
		len = 2;
	else if (IS_LCPRV1(*s))
		len = 3;
	else if (IS_LC2(*s))
		len = 3;
	else if (IS_LCPRV2(*s))
		len = 4;
	else
		len = 1;				/* assume ASCII */
	return len;
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Return the display width of the mule-internal-code character whose
 * first byte is *s.  Only that first byte is examined.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	int			len;

	/*
	 * Note: it's not really appropriate to assume that all multibyte charsets
	 * are double-wide on screen.  But this seems an okay approximation for
	 * the MULE charsets we currently support.
	 */

	if (IS_LC1(*s))
		len = 1;
	else if (IS_LCPRV1(*s))
		len = 1;
	else if (IS_LC2(*s))
		len = 2;
	else if (IS_LCPRV2(*s))
		len = 2;
	else
		len = 1;				/* assume ASCII */

	return len;
}
|
|
|
|
|
1998-06-16 09:29:54 +02:00
|
|
|
/*
|
|
|
|
* ISO8859-1
|
|
|
|
*/
|
2000-08-27 12:40:48 +02:00
|
|
|
static int
|
1998-06-16 09:29:54 +02:00
|
|
|
pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
1998-04-27 19:10:50 +02:00
|
|
|
{
|
2000-08-27 12:40:48 +02:00
|
|
|
int cnt = 0;
|
|
|
|
|
2001-03-08 01:24:34 +01:00
|
|
|
while (len > 0 && *from)
|
2000-08-27 12:40:48 +02:00
|
|
|
{
|
1998-06-16 09:29:54 +02:00
|
|
|
*to++ = *from++;
|
2001-03-08 01:24:34 +01:00
|
|
|
len--;
|
2000-08-27 12:40:48 +02:00
|
|
|
cnt++;
|
|
|
|
}
|
1998-06-16 09:29:54 +02:00
|
|
|
*to = 0;
|
2005-12-24 17:49:48 +01:00
|
|
|
return cnt;
|
1998-04-27 19:10:50 +02:00
|
|
|
}
|
|
|
|
|
2012-07-04 23:10:10 +02:00
|
|
|
/*
|
|
|
|
* Trivial conversion from pg_wchar to single byte encoding. Just ignores
|
|
|
|
* high bits.
|
|
|
|
* caller should allocate enough space for "to"
|
|
|
|
* len: length of from.
|
|
|
|
* "from" not necessarily null terminated.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|
|
|
{
|
|
|
|
int cnt = 0;
|
|
|
|
|
|
|
|
while (len > 0 && *from)
|
|
|
|
{
|
|
|
|
*to++ = *from++;
|
|
|
|
len--;
|
|
|
|
cnt++;
|
|
|
|
}
|
|
|
|
*to = 0;
|
|
|
|
return cnt;
|
|
|
|
}
|
|
|
|
|
1998-06-16 09:29:54 +02:00
|
|
|
/*
 * Byte length of a character in a single-byte encoding: always 1,
 * whatever s points at (s is not dereferenced).
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	return 1;
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Display width of a character in a single-byte encoding: defer entirely
 * to pg_ascii_dsplen().
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	return pg_ascii_dsplen(s);
}
|
|
|
|
|
1998-06-16 09:29:54 +02:00
|
|
|
/*
 * SJIS
 *
 * Return the byte length of the character whose first byte is *s.  Only
 * that first byte is examined.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	int			len;

	if (*s >= 0xa1 && *s <= 0xdf)
		len = 1;				/* 1 byte kana? */
	else if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Return the display width of the SJIS character whose first byte is *s.
 * Bytes without the high bit set are handed to pg_ascii_dsplen().
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	int			len;

	if (*s >= 0xa1 && *s <= 0xdf)
		len = 1;				/* 1 byte kana? */
	else if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
|
|
|
|
|
1999-02-02 19:51:40 +01:00
|
|
|
/*
 * Big5
 *
 * Return the byte length of the character whose first byte is *s: 2 if the
 * high bit is set, else 1.  Only that first byte is examined.
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Return the display width of the Big5 character whose first byte is *s:
 * 2 if the high bit is set, else whatever pg_ascii_dsplen() reports.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
|
|
|
|
|
2002-03-05 06:52:50 +01:00
|
|
|
/*
 * GBK
 *
 * Return the byte length of the character whose first byte is *s: 2 if the
 * high bit is set, else 1.  Only that first byte is examined.
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Return the display width of the GBK character whose first byte is *s:
 * 2 if the high bit is set, else whatever pg_ascii_dsplen() reports.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
|
|
|
|
|
2002-03-05 06:52:50 +01:00
|
|
|
/*
 * UHC
 *
 * Return the byte length of the character whose first byte is *s: 2 if the
 * high bit is set, else 1.  Only that first byte is examined.
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* 2byte? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Return the display width of the UHC character whose first byte is *s:
 * 2 if the high bit is set, else whatever pg_ascii_dsplen() reports.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* 2byte? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
|
|
|
|
|
2002-06-13 10:30:22 +02:00
|
|
|
/*
 * GB18030
 *	Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 */

/*
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input.  However, if you only pass the first byte of a multi-byte
 * string, and \0 as the second byte, this still works in a predictable way:
 * a 4-byte character will be reported as two 2-byte characters.  That's
 * enough for all current uses, as a client-only encoding.  It works that
 * way, because in any valid 4-byte GB18030-encoded character, the third and
 * fourth byte look like a 2-byte encoded character, when looked at
 * separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	int			len;

	if (!IS_HIGHBIT_SET(*s))
		len = 1;				/* ASCII */
	else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
		len = 4;				/* second byte in 0x30..0x39: 4-byte form */
	else
		len = 2;
	return len;
}
|
|
|
|
|
2004-03-15 11:41:26 +01:00
|
|
|
/*
 * Return the display width of the GB18030 character whose first byte is
 * *s: 2 if the high bit is set (regardless of byte length), else whatever
 * pg_ascii_dsplen() reports.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;
	else
		len = pg_ascii_dsplen(s);	/* ASCII */
	return len;
}
|
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
/*
 *-------------------------------------------------------------------
 * multibyte sequence validators
 *
 * The verifychar functions accept "s", a pointer to the first byte of a
 * string, and "len", the remaining length of the string.  If there is a
 * validly encoded character beginning at *s, return its length in bytes;
 * else return -1.
 *
 * The verifystr functions also accept "s", a pointer to a string and "len",
 * the length of the string.  They verify the whole string, and return the
 * number of input bytes (<= len) that are valid.  In other words, if the
 * whole string is valid, verifystr returns "len", otherwise it returns the
 * byte offset of the first invalid character.  The verifystr functions must
 * test for and reject zeroes in the input.
 *
 * The verifychar functions can assume that len > 0 and that *s != '\0', but
 * they must test for and reject zeroes in any additional bytes of a
 * multibyte character.  Note that this definition allows the function for a
 * single-byte encoding to be just "return 1".
 *-------------------------------------------------------------------
 */

/*
 * Single-byte verifychar: every byte is accepted as a one-byte character.
 * Neither s nor len is inspected.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	return 1;
}
|
1998-06-16 09:29:54 +02:00
|
|
|
|
2021-01-28 13:40:07 +01:00
|
|
|
/*
 * Single-byte verifystr: the only invalid byte is a zero byte.
 *
 * Returns len if the first len bytes of s contain no zero byte, otherwise
 * the offset of the first zero byte.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nullpos = memchr(s, 0, len);

	if (nullpos == NULL)
		return len;
	else
		return nullpos - s;
}
|
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
/* True when byte value c lies within 0xa1..0xfe inclusive (c is evaluated twice). */
#define IS_EUC_RANGE_VALID(c)	(!((c) < 0xa1 || (c) > 0xfe))
|
|
|
|
|
|
|
|
static int
|
2021-01-28 13:40:07 +01:00
|
|
|
pg_eucjp_verifychar(const unsigned char *s, int len)
|
1998-06-16 09:29:54 +02:00
|
|
|
{
|
2006-05-21 22:05:21 +02:00
|
|
|
int l;
|
|
|
|
unsigned char c1,
|
|
|
|
c2;
|
|
|
|
|
|
|
|
c1 = *s++;
|
|
|
|
|
|
|
|
switch (c1)
|
|
|
|
{
|
|
|
|
case SS2: /* JIS X 0201 */
|
|
|
|
l = 2;
|
|
|
|
if (l > len)
|
|
|
|
return -1;
|
|
|
|
c2 = *s++;
|
|
|
|
if (c2 < 0xa1 || c2 > 0xdf)
|
|
|
|
return -1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case SS3: /* JIS X 0212 */
|
|
|
|
l = 3;
|
|
|
|
if (l > len)
|
|
|
|
return -1;
|
|
|
|
c2 = *s++;
|
|
|
|
if (!IS_EUC_RANGE_VALID(c2))
|
|
|
|
return -1;
|
|
|
|
c2 = *s++;
|
|
|
|
if (!IS_EUC_RANGE_VALID(c2))
|
|
|
|
return -1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
|
|
|
|
{
|
|
|
|
l = 2;
|
|
|
|
if (l > len)
|
|
|
|
return -1;
|
|
|
|
if (!IS_EUC_RANGE_VALID(c1))
|
|
|
|
return -1;
|
|
|
|
c2 = *s++;
|
|
|
|
if (!IS_EUC_RANGE_VALID(c2))
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
/* must be ASCII */
|
|
|
|
{
|
|
|
|
l = 1;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return l;
|
1998-06-16 09:29:54 +02:00
|
|
|
}
|
2001-02-11 02:59:22 +01:00
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
/*
 * Verify an EUC_JP string of len bytes starting at s.
 *
 * Returns the number of leading bytes that are valid: len if the whole
 * string is valid, otherwise the offset of the first zero byte or of the
 * first character rejected by pg_eucjp_verifychar().
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_eucjp_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
|
|
|
|
|
|
|
|
/*
 * Verify a single EUC-KR character.
 *
 * s points at the first byte of the character; that byte is read
 * unconditionally.  len is the number of bytes available starting at s.
 *
 * Returns 1 for a byte without the high bit set, 2 for a two-byte character
 * whose bytes both satisfy IS_EUC_RANGE_VALID(), or -1 if the character is
 * truncated or either byte is out of range.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	int			l;
	unsigned char c1,
				c2;

	c1 = *s++;

	if (IS_HIGHBIT_SET(c1))
	{
		l = 2;
		if (l > len)
			return -1;
		if (!IS_EUC_RANGE_VALID(c1))
			return -1;
		c2 = *s++;
		if (!IS_EUC_RANGE_VALID(c2))
			return -1;
	}
	else
		/* must be ASCII */
	{
		l = 1;
	}

	return l;
}
|
2001-09-11 06:50:36 +02:00
|
|
|
|
2021-01-28 13:40:07 +01:00
|
|
|
/*
 * Verify an EUC-KR string.
 *
 * Scans at most len bytes starting at s and returns the number of leading
 * bytes that were accepted.  Scanning stops early at a NUL byte or at the
 * first character rejected by pg_euckr_verifychar().
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_euckr_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
|
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
/* EUC-CN byte sequences are exactly the same as EUC-KR */
#define pg_euccn_verifychar	pg_euckr_verifychar
#define pg_euccn_verifystr	pg_euckr_verifystr
|
2006-05-21 22:05:21 +02:00
|
|
|
|
|
|
|
static int
|
2021-01-28 13:40:07 +01:00
|
|
|
pg_euctw_verifychar(const unsigned char *s, int len)
|
2004-03-15 11:41:26 +01:00
|
|
|
{
|
2006-05-21 22:05:21 +02:00
|
|
|
int l;
|
|
|
|
unsigned char c1,
|
|
|
|
c2;
|
2004-03-15 11:41:26 +01:00
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
c1 = *s++;
|
|
|
|
|
|
|
|
switch (c1)
|
|
|
|
{
|
|
|
|
case SS2: /* CNS 11643 Plane 1-7 */
|
|
|
|
l = 4;
|
|
|
|
if (l > len)
|
|
|
|
return -1;
|
|
|
|
c2 = *s++;
|
|
|
|
if (c2 < 0xa1 || c2 > 0xa7)
|
|
|
|
return -1;
|
|
|
|
c2 = *s++;
|
|
|
|
if (!IS_EUC_RANGE_VALID(c2))
|
|
|
|
return -1;
|
|
|
|
c2 = *s++;
|
|
|
|
if (!IS_EUC_RANGE_VALID(c2))
|
|
|
|
return -1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case SS3: /* unused */
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
default:
|
|
|
|
if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
|
|
|
|
{
|
|
|
|
l = 2;
|
|
|
|
if (l > len)
|
|
|
|
return -1;
|
|
|
|
/* no further range check on c1? */
|
|
|
|
c2 = *s++;
|
|
|
|
if (!IS_EUC_RANGE_VALID(c2))
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
/* must be ASCII */
|
|
|
|
{
|
|
|
|
l = 1;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return l;
|
2004-03-15 11:41:26 +01:00
|
|
|
}
|
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
/*
 * Verify an EUC-TW string.
 *
 * Scans at most len bytes starting at s and returns the number of leading
 * bytes that were accepted.  Scanning stops early at a NUL byte or at the
 * first character rejected by pg_euctw_verifychar().
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_euctw_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
|
|
|
|
|
|
|
|
/*
 * Verify a single JOHAB character.
 *
 * The expected length is taken from pg_johab_mblen().  Returns that length,
 * or -1 if fewer than that many bytes are available in len or, for a
 * character whose first byte has the high bit set, any following byte fails
 * IS_EUC_RANGE_VALID().
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c;

	l = mbl = pg_johab_mblen(s);

	if (len < l)
		return -1;

	/* a first byte without the high bit needs no further checking */
	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	while (--l > 0)
	{
		c = *++s;
		if (!IS_EUC_RANGE_VALID(c))
			return -1;
	}
	return mbl;
}
|
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
/*
 * Verify a JOHAB string.
 *
 * Scans at most len bytes starting at s and returns the number of leading
 * bytes that were accepted.  Scanning stops early at a NUL byte or at the
 * first character rejected by pg_johab_verifychar().
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_johab_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
|
|
|
|
|
|
|
|
/*
 * Verify a single MULE internal code character.
 *
 * The expected length is taken from pg_mule_mblen().  Returns that length,
 * or -1 if fewer than that many bytes are available in len or any byte
 * after the first lacks the high bit.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c;

	l = mbl = pg_mule_mblen(s);

	if (len < l)
		return -1;

	while (--l > 0)
	{
		c = *++s;
		if (!IS_HIGHBIT_SET(c))
			return -1;
	}
	return mbl;
}
|
|
|
|
|
|
|
|
/*
 * Verify a MULE internal code string.
 *
 * Scans at most len bytes starting at s and returns the number of leading
 * bytes that were accepted.  Scanning stops early at a NUL byte or at the
 * first character rejected by pg_mule_verifychar().
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_mule_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
|
|
|
|
|
|
|
|
/*
 * Verify a single character in a single-byte encoding.
 *
 * Neither argument is inspected: the function always reports a one-byte
 * character.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	return 1;
}
|
|
|
|
|
|
|
|
/*
 * Verify a string in a single-byte encoding.
 *
 * The only byte rejected is NUL.  Returns the offset of the first NUL byte
 * within the first len bytes of s, or len if there is none.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nullpos = memchr(s, 0, len);

	if (nullpos == NULL)
		return len;
	else
		return nullpos - s;
}
|
|
|
|
|
|
|
|
/*
 * Verify a single Shift-JIS character.
 *
 * The expected length is taken from pg_sjis_mblen().  Returns that length,
 * or -1 if fewer than that many bytes are available in len or, for a
 * multibyte character, the first byte fails ISSJISHEAD() or the second
 * fails ISSJISTAIL().
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c1,
				c2;

	l = mbl = pg_sjis_mblen(s);

	if (len < l)
		return -1;

	if (l == 1)					/* pg_sjis_mblen already verified it */
		return mbl;

	c1 = *s++;
	c2 = *s;
	if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
		return -1;
	return mbl;
}
|
|
|
|
|
|
|
|
/*
 * Verify a Shift-JIS string.
 *
 * Scans at most len bytes starting at s and returns the number of leading
 * bytes that were accepted.  Scanning stops early at a NUL byte or at the
 * first character rejected by pg_sjis_verifychar().
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_sjis_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
|
|
|
|
|
|
|
|
/*
 * Verify a single Big5 character.
 *
 * The expected length is taken from pg_big5_mblen().  Returns that length,
 * or -1 if fewer than that many bytes are available in len or any byte
 * after the first is NUL.
 */
static int
pg_big5_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;

	l = mbl = pg_big5_mblen(s);

	if (len < l)
		return -1;

	while (--l > 0)
	{
		if (*++s == '\0')
			return -1;
	}

	return mbl;
}
|
|
|
|
|
|
|
|
/*
 * Verify a Big5 string.
 *
 * Scans at most len bytes starting at s and returns the number of leading
 * bytes that were accepted.  Scanning stops early at a NUL byte or at the
 * first character rejected by pg_big5_verifychar().
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_big5_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
|
|
|
|
|
|
|
|
/*
 * Verify a single GBK character.
 *
 * The expected length is taken from pg_gbk_mblen().  Returns that length,
 * or -1 if fewer than that many bytes are available in len or any byte
 * after the first is NUL.
 */
static int
pg_gbk_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;

	l = mbl = pg_gbk_mblen(s);

	if (len < l)
		return -1;

	while (--l > 0)
	{
		if (*++s == '\0')
			return -1;
	}

	return mbl;
}
|
2003-07-27 06:53:12 +02:00
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
/*
 * Verify a GBK string.
 *
 * Scans at most len bytes starting at s and returns the number of leading
 * bytes that were accepted.  Scanning stops early at a NUL byte or at the
 * first character rejected by pg_gbk_verifychar().
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_gbk_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
|
|
|
|
|
|
|
|
/*
 * Verify a single UHC character.
 *
 * The expected length is taken from pg_uhc_mblen().  Returns that length,
 * or -1 if fewer than that many bytes are available in len or any byte
 * after the first is NUL.
 */
static int
pg_uhc_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;

	l = mbl = pg_uhc_mblen(s);

	if (len < l)
		return -1;

	while (--l > 0)
	{
		if (*++s == '\0')
			return -1;
	}

	return mbl;
}
|
|
|
|
|
|
|
|
/*
 * Verify a UHC string.
 *
 * Scans at most len bytes starting at s and returns the number of leading
 * bytes that were accepted.  Scanning stops early at a NUL byte or at the
 * first character rejected by pg_uhc_verifychar().
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_uhc_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
|
|
|
|
|
|
|
|
/*
 * Verify a single GB18030 character.
 *
 * s points at the first byte of the character; that byte is read
 * unconditionally.  len is the number of bytes available starting at s, and
 * is tested before any later byte is read.
 *
 * Returns 1 for a byte without the high bit set, 4 or 2 for a well-formed
 * four- or two-byte character, and -1 otherwise.  A second byte in
 * 0x30..0x39 selects the four-byte form when at least four bytes are
 * available.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	int			l;

	if (!IS_HIGHBIT_SET(*s))
		l = 1;					/* ASCII */
	else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (*s >= 0x81 && *s <= 0xfe &&
			*(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
			*(s + 3) >= 0x30 && *(s + 3) <= 0x39)
			l = 4;
		else
			l = -1;
	}
	else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
	{
		/* Should be 2-byte, validate */
		if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
			(*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
			l = 2;
		else
			l = -1;
	}
	else
		l = -1;
	return l;
}
|
|
|
|
|
|
|
|
/*
 * Verify a GB18030 string.
 *
 * Scans at most len bytes starting at s and returns the number of leading
 * bytes that were accepted.  Scanning stops early at a NUL byte or at the
 * first character rejected by pg_gb18030_verifychar().
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_gb18030_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
|
|
|
|
|
|
|
|
/*
 * Verify a single UTF-8 character.
 *
 * s points at the first byte of the character; that byte is read
 * unconditionally.  len is the number of bytes available starting at s.
 *
 * Returns the character's length in bytes, or -1 if the first byte is NUL,
 * the sequence is longer than len, or pg_utf8_islegal() rejects it.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	int			l;

	if ((*s & 0x80) == 0)
	{
		if (*s == '\0')
			return -1;
		return 1;
	}
	else if ((*s & 0xe0) == 0xc0)
		l = 2;
	else if ((*s & 0xf0) == 0xe0)
		l = 3;
	else if ((*s & 0xf8) == 0xf0)
		l = 4;
	else
		l = 1;					/* not a recognized lead byte; left to
								 * pg_utf8_islegal() to judge */

	if (l > len)
		return -1;

	if (!pg_utf8_islegal(s, l))
		return -1;

	return l;
}
|
|
|
|
|
Add fast path for validating UTF-8 text
Our previous validator used a traditional algorithm that performed
comparison and branching one byte at a time. It's useful in that
we always know exactly how many bytes we have validated, but that
precision comes at a cost. Input validation can show up prominently
in profiles of COPY FROM, and future improvements to COPY FROM such
as parallelism or faster line parsing will put more pressure on input
validation. Hence, add fast paths for both ASCII and multibyte UTF-8:
Use bitwise operations to check 16 bytes at a time for ASCII. If
that fails, use a "shift-based" DFA on those bytes to handle the
general case, including multibyte. These paths are relatively free
of branches and thus robust against all kinds of byte patterns. With
these algorithms, UTF-8 validation is several times faster, depending
on platform and the input byte distribution.
The previous coding in pg_utf8_verifystr() is retained for short
strings and for when the fast path returns an error.
Review, performance testing, and additional hacking by: Heikki
Linakangas, Vladimir Sitnikov, Amit Khandekar, Thomas Munro, and
Greg Stark
Discussion:
https://www.postgresql.org/message-id/CAFBsxsEV_SzH%2BOLyCiyon%3DiwggSyMh_eF6A3LU2tiWf3Cy2ZQg%40mail.gmail.com
2021-10-19 22:43:14 +02:00
|
|
|
/*
 * The fast path of the UTF-8 verifier uses a deterministic finite automaton
 * (DFA) for multibyte characters. In a traditional table-driven DFA, the
 * input byte and current state are used to compute an index into an array of
 * state transitions. Since the address of the next transition is dependent
 * on this computation, there is latency in executing the load instruction,
 * and the CPU is not kept busy.
 *
 * Instead, we use a "shift-based" DFA as described by Per Vognsen:
 *
 * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
 *
 * In a shift-based DFA, the input byte is an index into an array of integers
 * whose bit pattern encodes the state transitions. To compute the next
 * state, we simply right-shift the integer by the current state and apply a
 * mask. In this scheme, the address of the transition only depends on the
 * input byte, so there is better pipelining.
 *
 * The naming convention for states and transitions was adopted from a UTF-8
 * to UTF-16/32 transcoder, whose table is reproduced below:
 *
 * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
 *
 * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
 * ==========================================================================
 * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
 * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
 *                                                                  |
 * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
 * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
 * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
 *                                                                  |
 * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
 * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
 *                                                                  |
 * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
 * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
 *
 * In the most straightforward implementation, a shift-based DFA for UTF-8
 * requires 64-bit integers to encode the transitions, but with an SMT solver
 * it's possible to find state numbers such that the transitions fit within
 * 32-bit integers, as Dougall Johnson demonstrated:
 *
 * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
 *
 * This packed representation is the reason for the seemingly odd choice of
 * state values below.
 */
|
|
|
|
|
|
|
|
/*
 * DFA states.  Each state value doubles as the bit offset at which that
 * state's successor is stored inside a transition word, so every value must
 * be below 32.
 */

/* Error */
#define ERR 0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2 1
#define CS3 5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A 6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.  Each word is built
 * by OR-ing together (next_state << current_state) for every transition
 * that does not lead to ERR.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/*
 * continuation byte
 *
 * The whole expansion is parenthesized so that these macros stay correct
 * when used inside a larger expression, not just as array initializers.
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
|
|
|
|
|
|
|
|
static const uint32 Utf8Transition[256] =
|
|
|
|
{
|
|
|
|
/* ASCII */
|
|
|
|
|
|
|
|
ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
|
|
|
|
|
|
|
|
/* continuation bytes */
|
|
|
|
|
|
|
|
/* 80..8F */
|
|
|
|
CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
|
|
|
|
CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
|
|
|
|
|
|
|
|
/* 90..9F */
|
|
|
|
CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
|
|
|
|
CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
|
|
|
|
|
|
|
|
/* A0..BF */
|
|
|
|
CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
|
|
|
|
CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
|
|
|
|
CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
|
|
|
|
CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
|
|
|
|
|
|
|
|
/* leading bytes */
|
|
|
|
|
|
|
|
/* C0..DF */
|
|
|
|
ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
|
|
|
|
L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
|
|
|
|
L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
|
|
|
|
L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
|
|
|
|
|
|
|
|
/* E0..EF */
|
|
|
|
L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
|
|
|
|
L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
|
|
|
|
|
|
|
|
/* F0..FF */
|
|
|
|
L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
|
|
|
|
ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
|
|
|
|
};
|
|
|
|
|
|
|
|
static void
|
|
|
|
utf8_advance(const unsigned char *s, uint32 *state, int len)
|
|
|
|
{
|
|
|
|
/* Note: We deliberately don't check the state's value here. */
|
|
|
|
while (len > 0)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* It's important that the mask value is 31: In most instruction sets,
|
|
|
|
* a shift by a 32-bit operand is understood to be a shift by its mod
|
|
|
|
* 32, so the compiler should elide the mask operation.
|
|
|
|
*/
|
|
|
|
*state = Utf8Transition[*s++] >> (*state & 31);
|
|
|
|
len--;
|
|
|
|
}
|
|
|
|
|
|
|
|
*state &= 31;
|
|
|
|
}
|
|
|
|
|
2021-01-28 13:40:07 +01:00
|
|
|
static int
|
|
|
|
pg_utf8_verifystr(const unsigned char *s, int len)
|
|
|
|
{
|
|
|
|
const unsigned char *start = s;
|
Add fast path for validating UTF-8 text
Our previous validator used a traditional algorithm that performed
comparison and branching one byte at a time. It's useful in that
we always know exactly how many bytes we have validated, but that
precision comes at a cost. Input validation can show up prominently
in profiles of COPY FROM, and future improvements to COPY FROM such
as parallelism or faster line parsing will put more pressure on input
validation. Hence, add fast paths for both ASCII and multibyte UTF-8:
Use bitwise operations to check 16 bytes at a time for ASCII. If
that fails, use a "shift-based" DFA on those bytes to handle the
general case, including multibyte. These paths are relatively free
of branches and thus robust against all kinds of byte patterns. With
these algorithms, UTF-8 validation is several times faster, depending
on platform and the input byte distribution.
The previous coding in pg_utf8_verifystr() is retained for short
strings and for when the fast path returns an error.
Review, performance testing, and additional hacking by: Heikki
Linakangas, Vladimir Sitnikov, Amit Khandekar, Thomas Munro, and
Greg Stark
Discussion:
https://www.postgresql.org/message-id/CAFBsxsEV_SzH%2BOLyCiyon%3DiwggSyMh_eF6A3LU2tiWf3Cy2ZQg%40mail.gmail.com
2021-10-19 22:43:14 +02:00
|
|
|
const int orig_len = len;
|
|
|
|
uint32 state = BGN;
|
|
|
|
|
|
|
|
/*
|
2022-08-26 10:01:24 +02:00
|
|
|
* With a stride of two vector widths, gcc will unroll the loop. Even if
|
|
|
|
* the compiler can unroll a longer loop, it's not worth it because we
|
|
|
|
* must fall back to the byte-wise algorithm if we find any non-ASCII.
|
Add fast path for validating UTF-8 text
Our previous validator used a traditional algorithm that performed
comparison and branching one byte at a time. It's useful in that
we always know exactly how many bytes we have validated, but that
precision comes at a cost. Input validation can show up prominently
in profiles of COPY FROM, and future improvements to COPY FROM such
as parallelism or faster line parsing will put more pressure on input
validation. Hence, add fast paths for both ASCII and multibyte UTF-8:
Use bitwise operations to check 16 bytes at a time for ASCII. If
that fails, use a "shift-based" DFA on those bytes to handle the
general case, including multibyte. These paths are relatively free
of branches and thus robust against all kinds of byte patterns. With
these algorithms, UTF-8 validation is several times faster, depending
on platform and the input byte distribution.
The previous coding in pg_utf8_verifystr() is retained for short
strings and for when the fast path returns an error.
Review, performance testing, and additional hacking by: Heikki
Linakangas, Vladimir Sitnikov, Amit Khandekar, Thomas Munro, and
Greg Stark
Discussion:
https://www.postgresql.org/message-id/CAFBsxsEV_SzH%2BOLyCiyon%3DiwggSyMh_eF6A3LU2tiWf3Cy2ZQg%40mail.gmail.com
2021-10-19 22:43:14 +02:00
|
|
|
*/
|
2022-08-26 10:01:24 +02:00
|
|
|
#define STRIDE_LENGTH (2 * sizeof(Vector8))
|
Add fast path for validating UTF-8 text
Our previous validator used a traditional algorithm that performed
comparison and branching one byte at a time. It's useful in that
we always know exactly how many bytes we have validated, but that
precision comes at a cost. Input validation can show up prominently
in profiles of COPY FROM, and future improvements to COPY FROM such
as parallelism or faster line parsing will put more pressure on input
validation. Hence, add fast paths for both ASCII and multibyte UTF-8:
Use bitwise operations to check 16 bytes at a time for ASCII. If
that fails, use a "shift-based" DFA on those bytes to handle the
general case, including multibyte. These paths are relatively free
of branches and thus robust against all kinds of byte patterns. With
these algorithms, UTF-8 validation is several times faster, depending
on platform and the input byte distribution.
The previous coding in pg_utf8_verifystr() is retained for short
strings and for when the fast path returns an error.
Review, performance testing, and additional hacking by: Heikki
Linakangas, Vladimir Sitnikov, Amit Khandekar, Thomas Munro, and
Greg Stark
Discussion:
https://www.postgresql.org/message-id/CAFBsxsEV_SzH%2BOLyCiyon%3DiwggSyMh_eF6A3LU2tiWf3Cy2ZQg%40mail.gmail.com
2021-10-19 22:43:14 +02:00
|
|
|
|
|
|
|
if (len >= STRIDE_LENGTH)
|
|
|
|
{
|
|
|
|
while (len >= STRIDE_LENGTH)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* If the chunk is all ASCII, we can skip the full UTF-8 check,
|
|
|
|
* but we must first check for a non-END state, which means the
|
|
|
|
* previous chunk ended in the middle of a multibyte sequence.
|
|
|
|
*/
|
|
|
|
if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
|
|
|
|
utf8_advance(s, &state, STRIDE_LENGTH);
|
|
|
|
|
|
|
|
s += STRIDE_LENGTH;
|
|
|
|
len -= STRIDE_LENGTH;
|
|
|
|
}
|
|
|
|
|
2022-01-18 04:53:50 +01:00
|
|
|
/* The error state persists, so we only need to check for it here. */
|
Add fast path for validating UTF-8 text
Our previous validator used a traditional algorithm that performed
comparison and branching one byte at a time. It's useful in that
we always know exactly how many bytes we have validated, but that
precision comes at a cost. Input validation can show up prominently
in profiles of COPY FROM, and future improvements to COPY FROM such
as parallelism or faster line parsing will put more pressure on input
validation. Hence, add fast paths for both ASCII and multibyte UTF-8:
Use bitwise operations to check 16 bytes at a time for ASCII. If
that fails, use a "shift-based" DFA on those bytes to handle the
general case, including multibyte. These paths are relatively free
of branches and thus robust against all kinds of byte patterns. With
these algorithms, UTF-8 validation is several times faster, depending
on platform and the input byte distribution.
The previous coding in pg_utf8_verifystr() is retained for short
strings and for when the fast path returns an error.
Review, performance testing, and additional hacking by: Heikki
Linakangas, Vladimir Sitnikov, Amit Khandekar, Thomas Munro, and
Greg Stark
Discussion:
https://www.postgresql.org/message-id/CAFBsxsEV_SzH%2BOLyCiyon%3DiwggSyMh_eF6A3LU2tiWf3Cy2ZQg%40mail.gmail.com
2021-10-19 22:43:14 +02:00
|
|
|
if (state == ERR)
|
|
|
|
{
|
2022-01-18 04:53:50 +01:00
|
|
|
/*
|
|
|
|
* Start over from the beginning with the slow path so we can
|
|
|
|
* count the valid bytes.
|
|
|
|
*/
|
Add fast path for validating UTF-8 text
Our previous validator used a traditional algorithm that performed
comparison and branching one byte at a time. It's useful in that
we always know exactly how many bytes we have validated, but that
precision comes at a cost. Input validation can show up prominently
in profiles of COPY FROM, and future improvements to COPY FROM such
as parallelism or faster line parsing will put more pressure on input
validation. Hence, add fast paths for both ASCII and multibyte UTF-8:
Use bitwise operations to check 16 bytes at a time for ASCII. If
that fails, use a "shift-based" DFA on those bytes to handle the
general case, including multibyte. These paths are relatively free
of branches and thus robust against all kinds of byte patterns. With
these algorithms, UTF-8 validation is several times faster, depending
on platform and the input byte distribution.
The previous coding in pg_utf8_verifystr() is retained for short
strings and for when the fast path returns an error.
Review, performance testing, and additional hacking by: Heikki
Linakangas, Vladimir Sitnikov, Amit Khandekar, Thomas Munro, and
Greg Stark
Discussion:
https://www.postgresql.org/message-id/CAFBsxsEV_SzH%2BOLyCiyon%3DiwggSyMh_eF6A3LU2tiWf3Cy2ZQg%40mail.gmail.com
2021-10-19 22:43:14 +02:00
|
|
|
len = orig_len;
|
|
|
|
s = start;
|
|
|
|
}
|
2022-01-18 04:53:50 +01:00
|
|
|
else if (state != END)
|
Add fast path for validating UTF-8 text
Our previous validator used a traditional algorithm that performed
comparison and branching one byte at a time. It's useful in that
we always know exactly how many bytes we have validated, but that
precision comes at a cost. Input validation can show up prominently
in profiles of COPY FROM, and future improvements to COPY FROM such
as parallelism or faster line parsing will put more pressure on input
validation. Hence, add fast paths for both ASCII and multibyte UTF-8:
Use bitwise operations to check 16 bytes at a time for ASCII. If
that fails, use a "shift-based" DFA on those bytes to handle the
general case, including multibyte. These paths are relatively free
of branches and thus robust against all kinds of byte patterns. With
these algorithms, UTF-8 validation is several times faster, depending
on platform and the input byte distribution.
The previous coding in pg_utf8_verifystr() is retained for short
strings and for when the fast path returns an error.
Review, performance testing, and additional hacking by: Heikki
Linakangas, Vladimir Sitnikov, Amit Khandekar, Thomas Munro, and
Greg Stark
Discussion:
https://www.postgresql.org/message-id/CAFBsxsEV_SzH%2BOLyCiyon%3DiwggSyMh_eF6A3LU2tiWf3Cy2ZQg%40mail.gmail.com
2021-10-19 22:43:14 +02:00
|
|
|
{
|
2022-01-18 04:53:50 +01:00
|
|
|
/*
|
|
|
|
* The fast path exited in the middle of a multibyte sequence.
|
|
|
|
* Walk backwards to find the leading byte so that the slow path
|
|
|
|
* can resume checking from there. We must always backtrack at
|
|
|
|
* least one byte, since the current byte could be e.g. an ASCII
|
|
|
|
* byte after a 2-byte lead, which is invalid.
|
|
|
|
*/
|
|
|
|
do
|
|
|
|
{
|
|
|
|
Assert(s > start);
|
|
|
|
s--;
|
|
|
|
len++;
|
|
|
|
Assert(IS_HIGHBIT_SET(*s));
|
|
|
|
} while (pg_utf_mblen(s) <= 1);
|
Add fast path for validating UTF-8 text
Our previous validator used a traditional algorithm that performed
comparison and branching one byte at a time. It's useful in that
we always know exactly how many bytes we have validated, but that
precision comes at a cost. Input validation can show up prominently
in profiles of COPY FROM, and future improvements to COPY FROM such
as parallelism or faster line parsing will put more pressure on input
validation. Hence, add fast paths for both ASCII and multibyte UTF-8:
Use bitwise operations to check 16 bytes at a time for ASCII. If
that fails, use a "shift-based" DFA on those bytes to handle the
general case, including multibyte. These paths are relatively free
of branches and thus robust against all kinds of byte patterns. With
these algorithms, UTF-8 validation is several times faster, depending
on platform and the input byte distribution.
The previous coding in pg_utf8_verifystr() is retained for short
strings and for when the fast path returns an error.
Review, performance testing, and additional hacking by: Heikki
Linakangas, Vladimir Sitnikov, Amit Khandekar, Thomas Munro, and
Greg Stark
Discussion:
https://www.postgresql.org/message-id/CAFBsxsEV_SzH%2BOLyCiyon%3DiwggSyMh_eF6A3LU2tiWf3Cy2ZQg%40mail.gmail.com
2021-10-19 22:43:14 +02:00
|
|
|
}
|
|
|
|
}
|
2021-01-28 13:40:07 +01:00
|
|
|
|
Add fast path for validating UTF-8 text
Our previous validator used a traditional algorithm that performed
comparison and branching one byte at a time. It's useful in that
we always know exactly how many bytes we have validated, but that
precision comes at a cost. Input validation can show up prominently
in profiles of COPY FROM, and future improvements to COPY FROM such
as parallelism or faster line parsing will put more pressure on input
validation. Hence, add fast paths for both ASCII and multibyte UTF-8:
Use bitwise operations to check 16 bytes at a time for ASCII. If
that fails, use a "shift-based" DFA on those bytes to handle the
general case, including multibyte. These paths are relatively free
of branches and thus robust against all kinds of byte patterns. With
these algorithms, UTF-8 validation is several times faster, depending
on platform and the input byte distribution.
The previous coding in pg_utf8_verifystr() is retained for short
strings and for when the fast path returns an error.
Review, performance testing, and additional hacking by: Heikki
Linakangas, Vladimir Sitnikov, Amit Khandekar, Thomas Munro, and
Greg Stark
Discussion:
https://www.postgresql.org/message-id/CAFBsxsEV_SzH%2BOLyCiyon%3DiwggSyMh_eF6A3LU2tiWf3Cy2ZQg%40mail.gmail.com
2021-10-19 22:43:14 +02:00
|
|
|
/* check remaining bytes */
|
2021-01-28 13:40:07 +01:00
|
|
|
while (len > 0)
|
|
|
|
{
|
|
|
|
int l;
|
|
|
|
|
|
|
|
/* fast path for ASCII-subset characters */
|
|
|
|
if (!IS_HIGHBIT_SET(*s))
|
|
|
|
{
|
|
|
|
if (*s == '\0')
|
|
|
|
break;
|
|
|
|
l = 1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
l = pg_utf8_verifychar(s, len);
|
|
|
|
if (l == -1)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
s += l;
|
|
|
|
len -= l;
|
|
|
|
}
|
|
|
|
|
|
|
|
return s - start;
|
|
|
|
}
|
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
/*
 * Decide whether the bytes at "source" form one legal UTF-8 character.
 *
 * The rules are those of RFC 3629.  Every byte after the first must be a
 * continuation byte (0x80..0xBF), and after certain lead bytes the second
 * byte is confined to a narrower range.  Those narrower ranges are what
 * rule out overlong forms (a longer-than-necessary sequence whose high
 * order bits are zero), the UTF-16 surrogate range, and values beyond
 * U+10FFFF.  Accepting an overlong form would be a security hazard, since
 * an apparently non-ASCII sequence could then decode to plain ASCII.
 *
 * "length" is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must already have checked that that many bytes are present in the
 * buffer.  Lengths outside 1..4 are rejected (5- and 6-byte forms are not
 * accepted for now).
 *
 * Returns true if the sequence is legal, false otherwise.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			pos;

	/* only 1- to 4-byte sequences can be legal */
	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes, when present, are plain continuation bytes */
	for (pos = length - 1; pos >= 2; pos--)
	{
		if (source[pos] < 0x80 || source[pos] > 0xBF)
			return false;
	}

	lead = source[0];

	if (length >= 2)
	{
		unsigned char second = source[1];
		unsigned char lowest = 0x80;
		unsigned char highest = 0xBF;

		/* some lead bytes tighten the permitted range of the second byte */
		if (lead == 0xE0)
			lowest = 0xA0;
		else if (lead == 0xED)
			highest = 0x9F;
		else if (lead == 0xF0)
			lowest = 0x90;
		else if (lead == 0xF4)
			highest = 0x8F;

		if (second < lowest || second > highest)
			return false;
	}

	/* finally, the lead byte itself: not 0x80..0xC1, and not above 0xF4 */
	if (lead >= 0x80 && lead < 0xC2)
		return false;

	return !(lead > 0xF4);
}
|
|
|
|
|
2011-10-29 20:22:20 +02:00
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
/*
 *-------------------------------------------------------------------
 * encoding info table
 *
 * One row per encoding.  The pg_encoding_* lookup functions in this file
 * index this array directly with the encoding ID and read the members
 * mblen, dsplen, mbverifychar, mbverifystr and maxmblen.
 *
 * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
 *
 * Judging by the function names, each row holds, in order: a conversion
 * to pg_wchar, a conversion from pg_wchar, then the mblen, dsplen,
 * mbverifychar and mbverifystr routines, and finally the maximum
 * character length in bytes.  The authoritative member order is that of
 * pg_wchar_tbl, which is declared outside this file -- confirm there.
 *
 * The last seven rows give 0 for both conversion routines.
 * NOTE(review): presumably these encodings are never converted to or
 * from pg_wchar here -- confirm before relying on it.
 *-------------------------------------------------------------------
 */
const pg_wchar_tbl pg_wchar_table[] = {
	{pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},	/* PG_SQL_ASCII */
	{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},	/* PG_EUC_JP */
	{pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},	/* PG_EUC_CN */
	{pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},	/* PG_EUC_KR */
	{pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},	/* PG_EUC_TW */
	{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},	/* PG_EUC_JIS_2004 */
	{pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},	/* PG_UTF8 */
	{pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},	/* PG_MULE_INTERNAL */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_LATIN1 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_LATIN2 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_LATIN3 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_LATIN4 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_LATIN5 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_LATIN6 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_LATIN7 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_LATIN8 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_LATIN9 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_LATIN10 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN1256 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN1258 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN866 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN874 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_KOI8R */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN1251 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN1252 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* ISO-8859-5 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* ISO-8859-6 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* ISO-8859-7 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* ISO-8859-8 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN1250 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN1253 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN1254 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN1255 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_WIN1257 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},	/* PG_KOI8U */
	{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},	/* PG_SJIS */
	{0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},	/* PG_BIG5 */
	{0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},	/* PG_GBK */
	{0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},	/* PG_UHC */
	{0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},	/* PG_GB18030 */
	{0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},	/* PG_JOHAB */
	{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2} /* PG_SHIFT_JIS_2004 */
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Returns the byte length of a multibyte character.
|
Fix incautious handling of possibly-miscoded strings in client code.
An incorrectly-encoded multibyte character near the end of a string
could cause various processing loops to run past the string's
terminating NUL, with results ranging from no detectable issue to
a program crash, depending on what happens to be in the following
memory.
This isn't an issue in the server, because we take care to verify
the encoding of strings before doing any interesting processing
on them. However, that lack of care leaked into client-side code
which shouldn't assume that anyone has validated the encoding of
its input.
Although this is certainly a bug worth fixing, the PG security team
elected not to regard it as a security issue, primarily because
any untrusted text should be sanitized by PQescapeLiteral or
the like before being incorporated into a SQL or psql command.
(If an app fails to do so, the same technique can be used to
cause SQL injection, with probably much more dire consequences
than a mere client-program crash.) Those functions were already
made proof against this class of problem, cf CVE-2006-2313.
To fix, invent PQmblenBounded() which is like PQmblen() except it
won't return more than the number of bytes remaining in the string.
In HEAD we can make this a new libpq function, as PQmblen() is.
It seems imprudent to change libpq's API in stable branches though,
so in the back branches define PQmblenBounded as a macro in the files
that need it. (Note that just changing PQmblen's behavior would not
be a good idea; notably, it would completely break the escaping
functions' defense against this exact problem. So we just want a
version for those callers that don't have any better way of handling
this issue.)
Per private report from houjingyi. Back-patch to all supported branches.
2021-06-07 20:15:25 +02:00
|
|
|
*
|
|
|
|
* Caution: when dealing with text that is not certainly valid in the
|
|
|
|
* specified encoding, the result may exceed the actual remaining
|
|
|
|
* string length. Callers that are not prepared to deal with that
|
|
|
|
* should use pg_encoding_mblen_bounded() instead.
|
2006-05-21 22:05:21 +02:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_encoding_mblen(int encoding, const char *mbstr)
|
|
|
|
{
|
2014-03-24 20:59:38 +01:00
|
|
|
return (PG_VALID_ENCODING(encoding) ?
|
2017-09-07 18:06:23 +02:00
|
|
|
pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
|
|
|
|
pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
|
2006-05-21 22:05:21 +02:00
|
|
|
}
|
|
|
|
|
Fix incautious handling of possibly-miscoded strings in client code.
An incorrectly-encoded multibyte character near the end of a string
could cause various processing loops to run past the string's
terminating NUL, with results ranging from no detectable issue to
a program crash, depending on what happens to be in the following
memory.
This isn't an issue in the server, because we take care to verify
the encoding of strings before doing any interesting processing
on them. However, that lack of care leaked into client-side code
which shouldn't assume that anyone has validated the encoding of
its input.
Although this is certainly a bug worth fixing, the PG security team
elected not to regard it as a security issue, primarily because
any untrusted text should be sanitized by PQescapeLiteral or
the like before being incorporated into a SQL or psql command.
(If an app fails to do so, the same technique can be used to
cause SQL injection, with probably much more dire consequences
than a mere client-program crash.) Those functions were already
made proof against this class of problem, cf CVE-2006-2313.
To fix, invent PQmblenBounded() which is like PQmblen() except it
won't return more than the number of bytes remaining in the string.
In HEAD we can make this a new libpq function, as PQmblen() is.
It seems imprudent to change libpq's API in stable branches though,
so in the back branches define PQmblenBounded as a macro in the files
that need it. (Note that just changing PQmblen's behavior would not
be a good idea; notably, it would completely break the escaping
functions' defense against this exact problem. So we just want a
version for those callers that don't have any better way of handling
this issue.)
Per private report from houjingyi. Back-patch to all supported branches.
2021-06-07 20:15:25 +02:00
|
|
|
/*
 * Like pg_encoding_mblen(), except that the result is capped at the
 * number of bytes remaining before the string's terminating NUL.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	int			claimed = pg_encoding_mblen(encoding, mbstr);

	/* strnlen() stops at the NUL, so it cannot report more than is there */
	return strnlen(mbstr, claimed);
}
|
|
|
|
|
2006-05-21 22:05:21 +02:00
|
|
|
/*
|
|
|
|
* Returns the display length of a multibyte character.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_encoding_dsplen(int encoding, const char *mbstr)
|
|
|
|
{
|
2014-03-24 20:59:38 +01:00
|
|
|
return (PG_VALID_ENCODING(encoding) ?
|
2017-09-07 18:06:23 +02:00
|
|
|
pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
|
|
|
|
pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
|
2006-05-21 22:05:21 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify the first multibyte character of the given string.
|
|
|
|
* Return its byte length if good, -1 if bad. (See comments above for
|
2021-01-28 13:40:07 +01:00
|
|
|
* full details of the mbverifychar API.)
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
|
|
|
|
{
|
|
|
|
return (PG_VALID_ENCODING(encoding) ?
|
|
|
|
pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
|
|
|
|
pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify that a string is valid for the given encoding.
|
|
|
|
* Returns the number of input bytes (<= len) that form a valid string.
|
|
|
|
* (See comments above for full details of the mbverifystr API.)
|
2006-05-21 22:05:21 +02:00
|
|
|
*/
|
|
|
|
int
|
2021-01-28 13:40:07 +01:00
|
|
|
pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
|
2006-05-21 22:05:21 +02:00
|
|
|
{
|
2014-03-24 20:59:38 +01:00
|
|
|
return (PG_VALID_ENCODING(encoding) ?
|
2021-01-28 13:40:07 +01:00
|
|
|
pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
|
|
|
|
pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
|
2006-05-21 22:05:21 +02:00
|
|
|
}
|
2005-06-15 02:15:08 +02:00
|
|
|
|
2001-09-11 06:50:36 +02:00
|
|
|
/*
|
2006-05-21 22:05:21 +02:00
|
|
|
* fetch maximum length of a given encoding
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_encoding_max_length(int encoding)
|
|
|
|
{
|
|
|
|
Assert(PG_VALID_ENCODING(encoding));
|
|
|
|
|
|
|
|
return pg_wchar_table[encoding].maxmblen;
|
|
|
|
}
|