Extend GB18030 encoding conversion to cover full Unicode range.
Our previous code for GB18030 <-> UTF8 conversion only covered Unicode code points up to U+FFFF, but the actual spec defines conversions for all code points up to U+10FFFF. That would be rather impractical as a lookup table, but fortunately there is a simple algorithmic conversion between the additional code points and the equivalent GB18030 byte patterns. Make use of the just-added callback facility in LocalToUtf/UtfToLocal to perform the additional conversions. Having created the infrastructure to do that, we can use the same code to map certain linearly-related subranges of the Unicode space below U+FFFF, allowing removal of the corresponding lookup table entries. This more than halves the lookup table size, which is a substantial savings; utf8_and_gb18030.so drops from nearly a megabyte to about half that. In support of doing that, replace ISO10646-GB18030.TXT with the data file gb-18030-2000.xml (retrieved from http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ ) in which these subranges have been deleted from the simple lookup entries. Per bug #12845 from Arjen Nienhuis. The conversion code added here is based on his proposed patch, though I whacked it around rather heavily.
This commit is contained in:
parent
92edba2665
commit
8d3e0906df
File diff suppressed because it is too large
Load Diff
|
@ -86,14 +86,14 @@ euc_tw_to_utf8.map utf8_to_euc_tw.map : CNS11643.TXT
|
|||
sjis_to_utf8.map utf8_to_sjis.map : CP932.TXT
|
||||
$(PERL) $(srcdir)/UCS_to_SJIS.pl
|
||||
|
||||
gb18030_to_utf8.map utf8_to_gb18030.map : ISO10646-GB18030.TXT
|
||||
gb18030_to_utf8.map utf8_to_gb18030.map : gb-18030-2000.xml
|
||||
$(PERL) $(srcdir)/UCS_to_GB18030.pl
|
||||
|
||||
big5_to_utf8.map utf8_to_big5.map : BIG5.TXT CP950.TXT
|
||||
$(PERL) $(srcdir)/UCS_to_BIG5.pl
|
||||
|
||||
clean:
|
||||
rm -f $(MAPS)
|
||||
|
||||
distclean: clean
|
||||
rm -f $(TEXTS)
|
||||
|
||||
maintainer-clean: distclean
|
||||
rm -f $(MAPS)
|
||||
|
|
|
@ -5,42 +5,46 @@
|
|||
# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
|
||||
#
|
||||
# Generate UTF-8 <--> GB18030 code conversion tables from
|
||||
# "ISO10646-GB18030.TXT"
|
||||
# "gb-18030-2000.xml"
|
||||
#
|
||||
# file format:
|
||||
# GB18030 hex code
|
||||
# UCS-2 hex code
|
||||
# The lines we care about in the source file look like
|
||||
# <a u="009A" b="81 30 83 36"/>
|
||||
# where the "u" field is the Unicode code point in hex,
|
||||
# and the "b" field is the hex byte sequence for GB18030
|
||||
|
||||
require "ucs2utf.pl";
|
||||
|
||||
|
||||
# first generate UTF-8 --> GB18030 table
|
||||
# Read the input
|
||||
|
||||
$in_file = "ISO10646-GB18030.TXT";
|
||||
$in_file = "gb-18030-2000.xml";
|
||||
|
||||
open(FILE, $in_file) || die("cannot open $in_file");
|
||||
|
||||
while (<FILE>)
|
||||
{
|
||||
chop;
|
||||
if (/^#/)
|
||||
{
|
||||
next;
|
||||
}
|
||||
($u, $c, $rest) = split;
|
||||
next if (! m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
|
||||
$u = $1;
|
||||
$c = $2;
|
||||
$c =~ s/ //g;
|
||||
$ucs = hex($u);
|
||||
$code = hex($c);
|
||||
if ($code >= 0x80 && $ucs >= 0x0080)
|
||||
{
|
||||
$utf = &ucs2utf($ucs);
|
||||
if ($array{$utf} ne "")
|
||||
if ($arrayu{$utf} ne "")
|
||||
{
|
||||
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
|
||||
next;
|
||||
}
|
||||
if ($arrayc{$code} ne "")
|
||||
{
|
||||
printf STDERR "Warning: duplicate GB18030: %08x\n", $code;
|
||||
next;
|
||||
}
|
||||
$arrayu{$utf} = $code;
|
||||
$arrayc{$code} = $utf;
|
||||
$count++;
|
||||
|
||||
$array{$utf} = $code;
|
||||
}
|
||||
}
|
||||
close(FILE);
|
||||
|
@ -54,11 +58,12 @@ $file = "utf8_to_gb18030.map";
|
|||
open(FILE, "> $file") || die("cannot open $file");
|
||||
print FILE "static const pg_utf_to_local ULmapGB18030[ $count ] = {\n";
|
||||
|
||||
for $index (sort { $a <=> $b } keys(%array))
|
||||
$cc = $count;
|
||||
for $index (sort { $a <=> $b } keys(%arrayu))
|
||||
{
|
||||
$code = $array{$index};
|
||||
$count--;
|
||||
if ($count == 0)
|
||||
$code = $arrayu{$index};
|
||||
$cc--;
|
||||
if ($cc == 0)
|
||||
{
|
||||
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
|
||||
}
|
||||
|
@ -75,43 +80,17 @@ close(FILE);
|
|||
#
|
||||
# then generate GB18030 --> UTF8 table
|
||||
#
|
||||
reset 'array';
|
||||
|
||||
open(FILE, $in_file) || die("cannot open $in_file");
|
||||
|
||||
while (<FILE>)
|
||||
{
|
||||
chop;
|
||||
if (/^#/)
|
||||
{
|
||||
next;
|
||||
}
|
||||
($u, $c, $rest) = split;
|
||||
$ucs = hex($u);
|
||||
$code = hex($c);
|
||||
if ($code >= 0x80 && $ucs >= 0x0080)
|
||||
{
|
||||
$utf = &ucs2utf($ucs);
|
||||
if ($array{$code} ne "")
|
||||
{
|
||||
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
|
||||
next;
|
||||
}
|
||||
$count++;
|
||||
|
||||
$array{$code} = $utf;
|
||||
}
|
||||
}
|
||||
close(FILE);
|
||||
|
||||
$file = "gb18030_to_utf8.map";
|
||||
open(FILE, "> $file") || die("cannot open $file");
|
||||
print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n";
|
||||
for $index (sort { $a <=> $b } keys(%array))
|
||||
|
||||
$cc = $count;
|
||||
for $index (sort { $a <=> $b } keys(%arrayc))
|
||||
{
|
||||
$utf = $array{$index};
|
||||
$count--;
|
||||
if ($count == 0)
|
||||
$utf = $arrayc{$index};
|
||||
$cc--;
|
||||
if ($cc == 0)
|
||||
{
|
||||
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -25,6 +25,161 @@ PG_FUNCTION_INFO_V1(utf8_to_gb18030);
|
|||
extern Datum gb18030_to_utf8(PG_FUNCTION_ARGS);
|
||||
extern Datum utf8_to_gb18030(PG_FUNCTION_ARGS);
|
||||
|
||||
/*
|
||||
* Convert 4-byte GB18030 characters to and from a linear code space
|
||||
*
|
||||
* The first and third bytes can range from 0x81 to 0xfe (126 values),
|
||||
* while the second and fourth bytes can range from 0x30 to 0x39 (10 values).
|
||||
*/
|
||||
static inline uint32
|
||||
gb_linear(uint32 gb)
|
||||
{
|
||||
uint32 b0 = (gb & 0xff000000) >> 24;
|
||||
uint32 b1 = (gb & 0x00ff0000) >> 16;
|
||||
uint32 b2 = (gb & 0x0000ff00) >> 8;
|
||||
uint32 b3 = (gb & 0x000000ff);
|
||||
|
||||
return b0 * 12600 + b1 * 1260 + b2 * 10 + b3 -
|
||||
(0x81 * 12600 + 0x30 * 1260 + 0x81 * 10 + 0x30);
|
||||
}
|
||||
|
||||
static inline uint32
|
||||
gb_unlinear(uint32 lin)
|
||||
{
|
||||
uint32 r0 = 0x81 + lin / 12600;
|
||||
uint32 r1 = 0x30 + (lin / 1260) % 10;
|
||||
uint32 r2 = 0x81 + (lin / 10) % 126;
|
||||
uint32 r3 = 0x30 + lin % 10;
|
||||
|
||||
return (r0 << 24) | (r1 << 16) | (r2 << 8) | r3;
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert word-formatted UTF8 to and from Unicode code points
|
||||
*
|
||||
* Probably this should be somewhere else ...
|
||||
*/
|
||||
static inline uint32
|
||||
unicode_to_utf8word(uint32 c)
|
||||
{
|
||||
uint32 word;
|
||||
|
||||
if (c <= 0x7F)
|
||||
{
|
||||
word = c;
|
||||
}
|
||||
else if (c <= 0x7FF)
|
||||
{
|
||||
word = (0xC0 | ((c >> 6) & 0x1F)) << 8;
|
||||
word |= 0x80 | (c & 0x3F);
|
||||
}
|
||||
else if (c <= 0xFFFF)
|
||||
{
|
||||
word = (0xE0 | ((c >> 12) & 0x0F)) << 16;
|
||||
word |= (0x80 | ((c >> 6) & 0x3F)) << 8;
|
||||
word |= 0x80 | (c & 0x3F);
|
||||
}
|
||||
else
|
||||
{
|
||||
word = (0xF0 | ((c >> 18) & 0x07)) << 24;
|
||||
word |= (0x80 | ((c >> 12) & 0x3F)) << 16;
|
||||
word |= (0x80 | ((c >> 6) & 0x3F)) << 8;
|
||||
word |= 0x80 | (c & 0x3F);
|
||||
}
|
||||
|
||||
return word;
|
||||
}
|
||||
|
||||
static inline uint32
|
||||
utf8word_to_unicode(uint32 c)
|
||||
{
|
||||
uint32 ucs;
|
||||
|
||||
if (c <= 0x7F)
|
||||
{
|
||||
ucs = c;
|
||||
}
|
||||
else if (c <= 0xFFFF)
|
||||
{
|
||||
ucs = ((c >> 8) & 0x1F) << 6;
|
||||
ucs |= c & 0x3F;
|
||||
}
|
||||
else if (c <= 0xFFFFFF)
|
||||
{
|
||||
ucs = ((c >> 16) & 0x0F) << 12;
|
||||
ucs |= ((c >> 8) & 0x3F) << 6;
|
||||
ucs |= c & 0x3F;
|
||||
}
|
||||
else
|
||||
{
|
||||
ucs = ((c >> 24) & 0x07) << 18;
|
||||
ucs |= ((c >> 16) & 0x3F) << 12;
|
||||
ucs |= ((c >> 8) & 0x3F) << 6;
|
||||
ucs |= c & 0x3F;
|
||||
}
|
||||
|
||||
return ucs;
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform mapping of GB18030 ranges to UTF8
|
||||
*
|
||||
* The ranges we need to convert are specified in gb-18030-2000.xml.
|
||||
* All are ranges of 4-byte GB18030 codes.
|
||||
*/
|
||||
static uint32
|
||||
conv_18030_to_utf8(uint32 code)
|
||||
{
|
||||
#define conv18030(minunicode, mincode, maxcode) \
|
||||
if (code >= mincode && code <= maxcode) \
|
||||
return unicode_to_utf8word(gb_linear(code) - gb_linear(mincode) + minunicode)
|
||||
|
||||
conv18030(0x0452, 0x8130D330, 0x8136A531);
|
||||
conv18030(0x2643, 0x8137A839, 0x8138FD38);
|
||||
conv18030(0x361B, 0x8230A633, 0x8230F237);
|
||||
conv18030(0x3CE1, 0x8231D438, 0x8232AF32);
|
||||
conv18030(0x4160, 0x8232C937, 0x8232F837);
|
||||
conv18030(0x44D7, 0x8233A339, 0x8233C931);
|
||||
conv18030(0x478E, 0x8233E838, 0x82349638);
|
||||
conv18030(0x49B8, 0x8234A131, 0x8234E733);
|
||||
conv18030(0x9FA6, 0x82358F33, 0x8336C738);
|
||||
conv18030(0xE865, 0x8336D030, 0x84308534);
|
||||
conv18030(0xFA2A, 0x84309C38, 0x84318537);
|
||||
conv18030(0xFFE6, 0x8431A234, 0x8431A439);
|
||||
conv18030(0x10000, 0x90308130, 0xE3329A35);
|
||||
/* No mapping exists */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform mapping of UTF8 ranges to GB18030
|
||||
*/
|
||||
static uint32
|
||||
conv_utf8_to_18030(uint32 code)
|
||||
{
|
||||
uint32 ucs = utf8word_to_unicode(code);
|
||||
|
||||
#define convutf8(minunicode, maxunicode, mincode) \
|
||||
if (ucs >= minunicode && ucs <= maxunicode) \
|
||||
return gb_unlinear(ucs - minunicode + gb_linear(mincode))
|
||||
|
||||
convutf8(0x0452, 0x200F, 0x8130D330);
|
||||
convutf8(0x2643, 0x2E80, 0x8137A839);
|
||||
convutf8(0x361B, 0x3917, 0x8230A633);
|
||||
convutf8(0x3CE1, 0x4055, 0x8231D438);
|
||||
convutf8(0x4160, 0x4336, 0x8232C937);
|
||||
convutf8(0x44D7, 0x464B, 0x8233A339);
|
||||
convutf8(0x478E, 0x4946, 0x8233E838);
|
||||
convutf8(0x49B8, 0x4C76, 0x8234A131);
|
||||
convutf8(0x9FA6, 0xD7FF, 0x82358F33);
|
||||
convutf8(0xE865, 0xF92B, 0x8336D030);
|
||||
convutf8(0xFA2A, 0xFE2F, 0x84309C38);
|
||||
convutf8(0xFFE6, 0xFFFF, 0x8431A234);
|
||||
convutf8(0x10000, 0x10FFFF, 0x90308130);
|
||||
/* No mapping exists */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ----------
|
||||
* conv_proc(
|
||||
* INTEGER, -- source encoding id
|
||||
|
@ -47,7 +202,7 @@ gb18030_to_utf8(PG_FUNCTION_ARGS)
|
|||
LocalToUtf(src, len, dest,
|
||||
LUmapGB18030, lengthof(LUmapGB18030),
|
||||
NULL, 0,
|
||||
NULL,
|
||||
conv_18030_to_utf8,
|
||||
PG_GB18030);
|
||||
|
||||
PG_RETURN_VOID();
|
||||
|
@ -65,7 +220,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS)
|
|||
UtfToLocal(src, len, dest,
|
||||
ULmapGB18030, lengthof(ULmapGB18030),
|
||||
NULL, 0,
|
||||
NULL,
|
||||
conv_utf8_to_18030,
|
||||
PG_GB18030);
|
||||
|
||||
PG_RETURN_VOID();
|
||||
|
|
Loading…
Reference in New Issue