Extend GB18030 encoding conversion to cover full Unicode range.

Our previous code for GB18030 <-> UTF8 conversion only covered Unicode code
points up to U+FFFF, but the actual spec defines conversions for all code
points up to U+10FFFF.  That would be rather impractical as a lookup table,
but fortunately there is a simple algorithmic conversion between the
additional code points and the equivalent GB18030 byte patterns.  Make use
of the just-added callback facility in LocalToUtf/UtfToLocal to perform the
additional conversions.

Having created the infrastructure to do that, we can use the same code to
map certain linearly-related subranges of the Unicode space below U+FFFF,
allowing removal of the corresponding lookup table entries.  This more
than halves the lookup table size, which is a substantial savings;
utf8_and_gb18030.so drops from nearly a megabyte to about half that.

In support of doing that, replace ISO10646-GB18030.TXT with the data file
gb-18030-2000.xml (retrieved from
http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ )
in which these subranges have been deleted from the simple lookup entries.

Per bug #12845 from Arjen Nienhuis.  The conversion code added here is
based on his proposed patch, though I whacked it around rather heavily.
This commit is contained in:
Tom Lane 2015-05-15 15:01:59 -04:00
parent 92edba2665
commit 8d3e0906df
7 changed files with 31111 additions and 128805 deletions

File diff suppressed because it is too large Load Diff

View File

@ -86,14 +86,14 @@ euc_tw_to_utf8.map utf8_to_euc_tw.map : CNS11643.TXT
sjis_to_utf8.map utf8_to_sjis.map : CP932.TXT
$(PERL) $(srcdir)/UCS_to_SJIS.pl
gb18030_to_utf8.map utf8_to_gb18030.map : ISO10646-GB18030.TXT
gb18030_to_utf8.map utf8_to_gb18030.map : gb-18030-2000.xml
$(PERL) $(srcdir)/UCS_to_GB18030.pl
big5_to_utf8.map utf8_to_big5.map : BIG5.TXT CP950.TXT
$(PERL) $(srcdir)/UCS_to_BIG5.pl
clean:
rm -f $(MAPS)
distclean: clean
rm -f $(TEXTS)
maintainer-clean: distclean
rm -f $(MAPS)

View File

@ -5,42 +5,46 @@
# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
#
# Generate UTF-8 <--> GB18030 code conversion tables from
# "ISO10646-GB18030.TXT"
# "gb-18030-2000.xml"
#
# file format:
# GB18030 hex code
# UCS-2 hex code
# The lines we care about in the source file look like
# <a u="009A" b="81 30 83 36"/>
# where the "u" field is the Unicode code point in hex,
# and the "b" field is the hex byte sequence for GB18030
require "ucs2utf.pl";
# first generate UTF-8 --> GB18030 table
# Read the input
$in_file = "ISO10646-GB18030.TXT";
$in_file = "gb-18030-2000.xml";
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($u, $c, $rest) = split;
next if (! m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
$u = $1;
$c = $2;
$c =~ s/ //g;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
if ($arrayu{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
if ($arrayc{$code} ne "")
{
printf STDERR "Warning: duplicate GB18030: %08x\n", $code;
next;
}
$arrayu{$utf} = $code;
$arrayc{$code} = $utf;
$count++;
$array{$utf} = $code;
}
}
close(FILE);
@ -54,11 +58,12 @@ $file = "utf8_to_gb18030.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "static const pg_utf_to_local ULmapGB18030[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
$cc = $count;
for $index (sort { $a <=> $b } keys(%arrayu))
{
$code = $array{$index};
$count--;
if ($count == 0)
$code = $arrayu{$index};
$cc--;
if ($cc == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
@ -75,43 +80,17 @@ close(FILE);
#
# then generate GB18030 --> UTF8 table
#
reset 'array';
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($u, $c, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
$array{$code} = $utf;
}
}
close(FILE);
$file = "gb18030_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
$cc = $count;
for $index (sort { $a <=> $b } keys(%arrayc))
{
$utf = $array{$index};
$count--;
if ($count == 0)
$utf = $arrayc{$index};
$cc--;
if ($cc == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -25,6 +25,161 @@ PG_FUNCTION_INFO_V1(utf8_to_gb18030);
extern Datum gb18030_to_utf8(PG_FUNCTION_ARGS);
extern Datum utf8_to_gb18030(PG_FUNCTION_ARGS);
/*
 * Map 4-byte GB18030 characters onto a contiguous ("linear") code space.
 *
 * In a 4-byte character the first and third bytes each take one of 126
 * values (0x81..0xfe), while the second and fourth bytes each take one of
 * 10 values (0x30..0x39).  The positional weights of the four bytes are
 * therefore 12600, 1260, 10 and 1.
 *
 * gb_linear returns the linear index of the code "gb", counted so that
 * 0x81308130 is index zero.  The individual bytes are not range-checked.
 */
static inline uint32
gb_linear(uint32 gb)
{
	uint32		lead = gb >> 24;
	uint32		second = (gb >> 16) & 0xff;
	uint32		third = (gb >> 8) & 0xff;
	uint32		last = gb & 0xff;
	uint32		weighted;
	uint32		origin;

	/* Horner form of lead*12600 + second*1260 + third*10 + last */
	weighted = ((lead * 10 + second) * 126 + third) * 10 + last;

	/* the same expression evaluated for the first code, 0x81308130 */
	origin = ((0x81 * 10 + 0x30) * 126 + 0x81) * 10 + 0x30;

	return weighted - origin;
}
/*
 * gb_unlinear: turn a linear index back into a 4-byte GB18030 code, with
 * index zero yielding 0x81308130.  The second and fourth bytes are built
 * from base-10 digits offset by 0x30, the third byte from a base-126 digit
 * offset by 0x81, and the first byte from whatever quotient remains,
 * offset by 0x81.
 */
static inline uint32
gb_unlinear(uint32 lin)
{
	uint32		rest = lin;
	uint32		gb;

	/* peel the bytes off from least to most significant */
	gb = 0x30 + rest % 10;
	rest /= 10;
	gb |= (0x81 + rest % 126) << 8;
	rest /= 126;
	gb |= (0x30 + rest % 10) << 16;
	rest /= 10;
	gb |= (0x81 + rest) << 24;

	return gb;
}
/*
 * Conversion between Unicode code points and "word-formatted" UTF8, i.e.
 * the bytes of a UTF8 sequence packed into a uint32 with the first byte of
 * the sequence in the most significant occupied position.
 *
 * (This arguably belongs somewhere more widely shared.)
 *
 * unicode_to_utf8word packs the 1- to 4-byte UTF8 encoding of code point
 * "c" into a word.
 */
static inline uint32
unicode_to_utf8word(uint32 c)
{
	/* continuation bytes, numbered from the end of the sequence */
	uint32		tail0 = 0x80 | (c & 0x3F);
	uint32		tail1 = 0x80 | ((c >> 6) & 0x3F);
	uint32		tail2 = 0x80 | ((c >> 12) & 0x3F);

	/* one byte: plain ASCII */
	if (c <= 0x7F)
		return c;

	/* two bytes: 110xxxxx 10xxxxxx */
	if (c <= 0x7FF)
		return ((0xC0 | ((c >> 6) & 0x1F)) << 8) | tail0;

	/* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	if (c <= 0xFFFF)
		return ((0xE0 | ((c >> 12) & 0x0F)) << 16) | (tail1 << 8) | tail0;

	/* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	return ((0xF0 | ((c >> 18) & 0x07)) << 24) |
		(tail2 << 16) | (tail1 << 8) | tail0;
}
/*
 * utf8word_to_unicode: decode a UTF8 sequence packed into a word (first
 * byte most significant) back to its Unicode code point.
 *
 * The sequence length is inferred from the magnitude of the word, and the
 * marker bits of each byte are simply masked off; nothing is validated.
 */
static inline uint32
utf8word_to_unicode(uint32 c)
{
	/* payload bits of the final byte, used by every multibyte form */
	uint32		low = c & 0x3F;

	/* one byte */
	if (c <= 0x7F)
		return c;

	/* two bytes */
	if (c <= 0xFFFF)
		return (((c >> 8) & 0x1F) << 6) | low;

	/* three bytes */
	if (c <= 0xFFFFFF)
		return (((c >> 16) & 0x0F) << 12) |
			(((c >> 8) & 0x3F) << 6) |
			low;

	/* four bytes */
	return (((c >> 24) & 0x07) << 18) |
		(((c >> 16) & 0x3F) << 12) |
		(((c >> 8) & 0x3F) << 6) |
		low;
}
/*
 * Perform mapping of GB18030 ranges to UTF8
 *
 * The ranges we need to convert are specified in gb-18030-2000.xml.
 * All are ranges of 4-byte GB18030 codes.  Within each range the GB18030
 * codes, taken in linear order, correspond one-for-one to consecutive
 * Unicode code points starting at the range's first code point.
 *
 * "code" is a 4-byte GB18030 character packed into a word, first byte most
 * significant.  The result is the equivalent character as word-formatted
 * UTF8, or 0 if the code falls in none of the ranges.
 *
 * The ranges are kept in a table rather than expanded from a function-local
 * macro, so that no macro with a hidden "return" leaks into the rest of
 * the file.
 */
static uint32
conv_18030_to_utf8(uint32 code)
{
	static const struct
	{
		uint32		min_unicode;	/* code point matching min_code */
		uint32		min_code;		/* first GB18030 code of the range */
		uint32		max_code;		/* last GB18030 code of the range */
	}			ranges[] =
	{
		{0x0452, 0x8130D330, 0x8136A531},
		{0x2643, 0x8137A839, 0x8138FD38},
		{0x361B, 0x8230A633, 0x8230F237},
		{0x3CE1, 0x8231D438, 0x8232AF32},
		{0x4160, 0x8232C937, 0x8232F837},
		{0x44D7, 0x8233A339, 0x8233C931},
		{0x478E, 0x8233E838, 0x82349638},
		{0x49B8, 0x8234A131, 0x8234E733},
		{0x9FA6, 0x82358F33, 0x8336C738},
		{0xE865, 0x8336D030, 0x84308534},
		{0xFA2A, 0x84309C38, 0x84318537},
		{0xFFE6, 0x8431A234, 0x8431A439},
		{0x10000, 0x90308130, 0xE3329A35}
	};
	const int	nranges = (int) (sizeof(ranges) / sizeof(ranges[0]));
	int			i;

	for (i = 0; i < nranges; i++)
	{
		if (code >= ranges[i].min_code && code <= ranges[i].max_code)
		{
			/* offset within the range, measured in linear code space */
			uint32		offset = gb_linear(code) - gb_linear(ranges[i].min_code);

			return unicode_to_utf8word(offset + ranges[i].min_unicode);
		}
	}

	/* No mapping exists */
	return 0;
}
/*
 * Perform mapping of UTF8 ranges to GB18030
 *
 * "code" is a character in word-formatted UTF8.  It is first decoded to a
 * Unicode code point; if that code point lies in one of the ranges below,
 * the result is the 4-byte GB18030 code found by stepping the same distance
 * through linear GB18030 code space from the range's first code.  Returns 0
 * if the code point falls in none of the ranges.
 *
 * The ranges are kept in a table rather than expanded from a function-local
 * macro, so that no macro with a hidden "return" leaks into the rest of
 * the file.
 */
static uint32
conv_utf8_to_18030(uint32 code)
{
	static const struct
	{
		uint32		min_unicode;	/* first code point of the range */
		uint32		max_unicode;	/* last code point of the range */
		uint32		min_code;		/* GB18030 code matching min_unicode */
	}			ranges[] =
	{
		{0x0452, 0x200F, 0x8130D330},
		{0x2643, 0x2E80, 0x8137A839},
		{0x361B, 0x3917, 0x8230A633},
		{0x3CE1, 0x4055, 0x8231D438},
		{0x4160, 0x4336, 0x8232C937},
		{0x44D7, 0x464B, 0x8233A339},
		{0x478E, 0x4946, 0x8233E838},
		{0x49B8, 0x4C76, 0x8234A131},
		{0x9FA6, 0xD7FF, 0x82358F33},
		{0xE865, 0xF92B, 0x8336D030},
		{0xFA2A, 0xFE2F, 0x84309C38},
		{0xFFE6, 0xFFFF, 0x8431A234},
		{0x10000, 0x10FFFF, 0x90308130}
	};
	const int	nranges = (int) (sizeof(ranges) / sizeof(ranges[0]));
	uint32		ucs = utf8word_to_unicode(code);
	int			i;

	for (i = 0; i < nranges; i++)
	{
		if (ucs >= ranges[i].min_unicode && ucs <= ranges[i].max_unicode)
		{
			/* offset within the range, measured in code points */
			uint32		offset = ucs - ranges[i].min_unicode;

			return gb_unlinear(offset + gb_linear(ranges[i].min_code));
		}
	}

	/* No mapping exists */
	return 0;
}
/* ----------
* conv_proc(
* INTEGER, -- source encoding id
@ -47,7 +202,7 @@ gb18030_to_utf8(PG_FUNCTION_ARGS)
LocalToUtf(src, len, dest,
LUmapGB18030, lengthof(LUmapGB18030),
NULL, 0,
NULL,
conv_18030_to_utf8,
PG_GB18030);
PG_RETURN_VOID();
@ -65,7 +220,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS)
UtfToLocal(src, len, dest,
ULmapGB18030, lengthof(ULmapGB18030),
NULL, 0,
NULL,
conv_utf8_to_18030,
PG_GB18030);
PG_RETURN_VOID();