mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-09-30 11:11:16 +02:00
Add direct conversion routines between EUC_TW and Big5.
Conversions between EUC_TW and Big5 were previously implemented by converting the whole input to MIC first, and then from MIC to the target encoding. Implement functions to convert directly between the two. The reason to do this now is that I'm working on a patch that will change the conversion function signature so that if the input is invalid, we convert as much as we can and return the number of bytes successfully converted. That's not possible if we use an intermediary format, because if an error happens in the intermediary -> final conversion, we lose track of the location of the invalid character in the original input. Avoiding the intermediate step makes the conversions faster, too. Reviewed-by: John Naylor Discussion: https://www.postgresql.org/message-id/b9e3167f-f84b-7aa4-5738-be578a4db924%40iki.fi
This commit is contained in:
parent
b80e10638e
commit
6c5576075b
@ -37,6 +37,8 @@ PG_FUNCTION_INFO_V1(mic_to_big5);
|
||||
* ----------
|
||||
*/
|
||||
|
||||
static void euc_tw2big5(const unsigned char *euc, unsigned char *p, int len);
|
||||
static void big52euc_tw(const unsigned char *euc, unsigned char *p, int len);
|
||||
static void big52mic(const unsigned char *big5, unsigned char *p, int len);
|
||||
static void mic2big5(const unsigned char *mic, unsigned char *p, int len);
|
||||
static void euc_tw2mic(const unsigned char *euc, unsigned char *p, int len);
|
||||
@ -48,14 +50,10 @@ euc_tw_to_big5(PG_FUNCTION_ARGS)
|
||||
unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
|
||||
unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
|
||||
int len = PG_GETARG_INT32(4);
|
||||
unsigned char *buf;
|
||||
|
||||
CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_BIG5);
|
||||
|
||||
buf = palloc(len * ENCODING_GROWTH_RATE + 1);
|
||||
euc_tw2mic(src, buf, len);
|
||||
mic2big5(buf, dest, strlen((char *) buf));
|
||||
pfree(buf);
|
||||
euc_tw2big5(src, dest, len);
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
@ -66,14 +64,10 @@ big5_to_euc_tw(PG_FUNCTION_ARGS)
|
||||
unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
|
||||
unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
|
||||
int len = PG_GETARG_INT32(4);
|
||||
unsigned char *buf;
|
||||
|
||||
CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_EUC_TW);
|
||||
|
||||
buf = palloc(len * ENCODING_GROWTH_RATE + 1);
|
||||
big52mic(src, buf, len);
|
||||
mic2euc_tw(buf, dest, strlen((char *) buf));
|
||||
pfree(buf);
|
||||
big52euc_tw(src, dest, len);
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
@ -134,6 +128,136 @@ mic_to_big5(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* EUC_TW ---> Big5
|
||||
*/
|
||||
static void
|
||||
euc_tw2big5(const unsigned char *euc, unsigned char *p, int len)
|
||||
{
|
||||
unsigned char c1;
|
||||
unsigned short big5buf,
|
||||
cnsBuf;
|
||||
unsigned char lc;
|
||||
int l;
|
||||
|
||||
while (len > 0)
|
||||
{
|
||||
c1 = *euc;
|
||||
if (IS_HIGHBIT_SET(c1))
|
||||
{
|
||||
/* Verify and decode the next EUC_TW input character */
|
||||
l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len);
|
||||
if (l < 0)
|
||||
report_invalid_encoding(PG_EUC_TW,
|
||||
(const char *) euc, len);
|
||||
if (c1 == SS2)
|
||||
{
|
||||
c1 = euc[1]; /* plane No. */
|
||||
if (c1 == 0xa1)
|
||||
lc = LC_CNS11643_1;
|
||||
else if (c1 == 0xa2)
|
||||
lc = LC_CNS11643_2;
|
||||
else
|
||||
lc = c1 - 0xa3 + LC_CNS11643_3;
|
||||
cnsBuf = (euc[2] << 8) | euc[3];
|
||||
}
|
||||
else
|
||||
{ /* CNS11643-1 */
|
||||
lc = LC_CNS11643_1;
|
||||
cnsBuf = (c1 << 8) | euc[1];
|
||||
}
|
||||
|
||||
/* Write it out in Big5 */
|
||||
big5buf = CNStoBIG5(cnsBuf, lc);
|
||||
if (big5buf == 0)
|
||||
report_untranslatable_char(PG_EUC_TW, PG_BIG5,
|
||||
(const char *) euc, len);
|
||||
*p++ = (big5buf >> 8) & 0x00ff;
|
||||
*p++ = big5buf & 0x00ff;
|
||||
|
||||
euc += l;
|
||||
len -= l;
|
||||
}
|
||||
else
|
||||
{ /* should be ASCII */
|
||||
if (c1 == 0)
|
||||
report_invalid_encoding(PG_EUC_TW,
|
||||
(const char *) euc, len);
|
||||
*p++ = c1;
|
||||
euc++;
|
||||
len--;
|
||||
}
|
||||
}
|
||||
*p = '\0';
|
||||
}
|
||||
|
||||
/*
|
||||
* Big5 ---> EUC_TW
|
||||
*/
|
||||
static void
|
||||
big52euc_tw(const unsigned char *big5, unsigned char *p, int len)
|
||||
{
|
||||
unsigned short c1;
|
||||
unsigned short big5buf,
|
||||
cnsBuf;
|
||||
unsigned char lc;
|
||||
int l;
|
||||
|
||||
while (len > 0)
|
||||
{
|
||||
/* Verify and decode the next Big5 input character */
|
||||
c1 = *big5;
|
||||
if (IS_HIGHBIT_SET(c1))
|
||||
{
|
||||
l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len);
|
||||
if (l < 0)
|
||||
report_invalid_encoding(PG_BIG5,
|
||||
(const char *) big5, len);
|
||||
big5buf = (c1 << 8) | big5[1];
|
||||
cnsBuf = BIG5toCNS(big5buf, &lc);
|
||||
|
||||
if (lc == LC_CNS11643_1)
|
||||
{
|
||||
*p++ = (cnsBuf >> 8) & 0x00ff;
|
||||
*p++ = cnsBuf & 0x00ff;
|
||||
}
|
||||
else if (lc == LC_CNS11643_2)
|
||||
{
|
||||
*p++ = SS2;
|
||||
*p++ = 0xa2;
|
||||
*p++ = (cnsBuf >> 8) & 0x00ff;
|
||||
*p++ = cnsBuf & 0x00ff;
|
||||
}
|
||||
else if (lc >= LC_CNS11643_3 && lc <= LC_CNS11643_7)
|
||||
{
|
||||
*p++ = SS2;
|
||||
*p++ = lc - LC_CNS11643_3 + 0xa3;
|
||||
*p++ = (cnsBuf >> 8) & 0x00ff;
|
||||
*p++ = cnsBuf & 0x00ff;
|
||||
}
|
||||
else
|
||||
report_untranslatable_char(PG_BIG5, PG_EUC_TW,
|
||||
(const char *) big5, len);
|
||||
|
||||
big5 += l;
|
||||
len -= l;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* ASCII */
|
||||
if (c1 == 0)
|
||||
report_invalid_encoding(PG_BIG5,
|
||||
(const char *) big5, len);
|
||||
*p++ = c1;
|
||||
big5++;
|
||||
len--;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
*p = '\0';
|
||||
}
|
||||
|
||||
/*
|
||||
* EUC_TW ---> MIC
|
||||
*/
|
||||
|
Loading…
Reference in New Issue
Block a user