Rewrite the perl scripts to produce our Unicode conversion tables.

Generate EUC_CN mappings from gb-18030-2000.xml, because GB2312.TXT is no
longer available.

Get UHC from windows-949-2000.xml, because it's more up-to-date.

Plus tons more small changes. With these changes, the perl scripts
faithfully produce the *.map files we have in the repository, from the
external source files.

In passing, fix the Makefile to also download CP932.TXT and CP950.TXT.

Based on patches by Kyotaro Horiguchi, reviewed by Daniel Gustafsson.

Discussion: https://postgr.es/m/08e7892a-d55c-eefe-76e6-7910bc8dd1f3@iki.fi
This commit is contained in:
Heikki Linnakangas 2016-11-30 14:54:02 +02:00
parent 6c303223be
commit 1de9cc0dcc
33 changed files with 809 additions and 1559 deletions

View File

@ -39,8 +39,6 @@ WINMAPS = win866_to_utf8.map utf8_to_win866.map \
win1258_to_utf8.map utf8_to_win1258.map
GENERICMAPS = $(ISO8859MAPS) $(WINMAPS) \
johab_to_utf8.map utf8_to_johab.map \
uhc_to_utf8.map utf8_to_uhc.map \
gbk_to_utf8.map utf8_to_gbk.map \
koi8r_to_utf8.map utf8_to_koi8r.map
@ -51,6 +49,8 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \
sjis_to_utf8.map utf8_to_sjis.map \
gb18030_to_utf8.map utf8_to_gb18030.map \
big5_to_utf8.map utf8_to_big5.map \
johab_to_utf8.map utf8_to_johab.map \
uhc_to_utf8.map utf8_to_uhc.map \
euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map \
utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map \
shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map \
@ -63,23 +63,29 @@ ISO8859TEXTS = 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT \
8859-10.TXT 8859-13.TXT 8859-14.TXT 8859-15.TXT \
8859-16.TXT
WINTEXTS = CP866.TXT CP874.TXT CP936.TXT CP949.TXT \
WINTEXTS = CP866.TXT CP874.TXT CP936.TXT \
CP1250.TXT CP1251.TXT \
CP1252.TXT CP1253.TXT CP1254.TXT CP1255.TXT \
CP1256.TXT CP1257.TXT CP1258.TXT
GENERICTEXTS = $(ISO8859TEXTS) $(WINTEXTS) \
KOI8-R.TXT KOI8-U.TXT JOHAB.TXT
KOI8-R.TXT KOI8-U.TXT
all: $(MAPS)
$(GENERICMAPS): UCS_to_most.pl $(GENERICTEXTS)
$(PERL) $<
euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl JIS0201.TXT JIS0208.TXT JIS0212.TXT
johab_to_utf8.map utf8_to_johab.map: UCS_to_JOHAB.pl JOHAB.TXT
$(PERL) $<
euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl GB2312.TXT
uhc_to_utf8.map utf8_to_uhc.map: UCS_to_UHC.pl windows-949-2000.xml
$(PERL) $<
euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl CP932.TXT JIS0212.TXT
$(PERL) $<
euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl gb-18030-2000.xml
$(PERL) $<
euc_kr_to_utf8.map utf8_to_euc_kr.map: UCS_to_EUC_KR.pl KSX1001.TXT
@ -119,7 +125,7 @@ BIG5.TXT CNS11643.TXT:
euc-jis-2004-std.txt sjis-0213-2004-std.txt:
$(DOWNLOAD) http://x0213.org/codetable/$(@F)
gb-18030-2000.xml:
gb-18030-2000.xml windows-949-2000.xml:
$(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F)
GB2312.TXT:
@ -137,7 +143,7 @@ KOI8-R.TXT KOI8-U.TXT:
$(ISO8859TEXTS):
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
$(filter-out CP8%,$(WINTEXTS)):
$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
$(filter CP8%,$(WINTEXTS)):

View File

@ -25,56 +25,17 @@
# # and Unicode name (not used in this script)
require "ucs2utf.pl";
require "convutils.pm";
# Load BIG5.TXT
my $all = &read_source("BIG5.TXT");
#
# first, generate UTF8 --> BIG5 table
#
$in_file = "BIG5.TXT";
# Load CP950.TXT
my $cp950txt = &read_source("CP950.TXT");
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = $code;
}
}
close(FILE);
$in_file = "CP950.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
foreach my $i (@$cp950txt) {
my $code = $i->{code};
my $ucs = $i->{ucs};
# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
# from CP950.TXT
@ -83,126 +44,25 @@ while (<FILE>)
&& $code >= 0xf9d6
&& $code <= 0xf9dc)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = $code;
}
}
close(FILE);
$file = lc("utf8_to_big5.map");
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapBIG5[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
push @$all, {code => $code,
ucs => $ucs,
comment => $i->{comment},
direction => "both"};
}
}
print FILE "};\n";
close(FILE);
foreach my $i (@$all) {
my $code = $i->{code};
my $ucs = $i->{ucs};
#
# then generate BIG5 --> UTF8 table
#
$in_file = "BIG5.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
while (<FILE>)
{
chop;
if (/^#/)
# BIG5.TXT maps several BIG5 characters to U+FFFD. The UTF-8 to BIG5 mapping can
# contain only one of them. XXX: Doesn't really make sense to include any of them,
# but for historical reasons, we map the first one of them.
if ($i->{ucs} == 0xFFFD && $i->{code} != 0xA15A)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$code} = $utf;
}
}
close(FILE);
$in_file = "CP950.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
# from CP950.TXT
if ( $code >= 0x80
&& $ucs >= 0x0080
&& $code >= 0xf9d6
&& $code <= 0xf9dc)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$code} = $utf;
}
}
close(FILE);
$file = lc("big5_to_utf8.map");
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapBIG5[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
$i->{direction} = "to_unicode";
}
}
print FILE "};\n";
close(FILE);
# Output
print_tables("BIG5", $all);

View File

@ -1,128 +1,76 @@
#! /usr/bin/perl
#
# Copyright (c) 2001-2016, PostgreSQL Global Development Group
# Copyright (c) 2007-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
#
# Generate UTF-8 <--> EUC_CN code conversion tables from
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain GB2312.TXT from
# the organization's ftp site.
# Generate UTF-8 <--> EUC_CN code conversion tables from
# "gb-18030-2000.xml", obtained from
# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
#
# GB2312.TXT format:
# GB2312 code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)
# The lines we care about in the source file look like
# <a u="009A" b="81 30 83 36"/>
# where the "u" field is the Unicode code point in hex,
# and the "b" field is the hex byte sequence for GB18030
require "ucs2utf.pl";
require "convutils.pm";
# first generate UTF-8 --> EUC_CN table
# Read the input
$in_file = "GB2312.TXT";
$in_file = "gb-18030-2000.xml";
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = ($code | 0x8080);
}
}
close(FILE);
$file = "utf8_to_euc_cn.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapEUC_CN[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
}
print FILE "};\n";
close(FILE);
#
# then generate EUC_CN --> UTF8 table
#
reset 'array';
open(FILE, $in_file) || die("cannot open $in_file");
my @mapping;
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
$u = $1;
$c = $2;
$c =~ s/ //g;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
$code |= 0x8080;
$array{$code} = $utf;
# The GB-18030 character set, which we use as the source, contains
# a lot of extra characters on top of the GB2312 character set that
# EUC_CN encodes. Filter out those extra characters.
next if (($code & 0xFF) < 0xA1);
next if (!($code >= 0xA100 && $code <= 0xA9FF ||
$code >= 0xB000 && $code <= 0xF7FF));
next if ($code >= 0xA2A1 && $code <= 0xA2B0);
next if ($code >= 0xA2E3 && $code <= 0xA2E4);
next if ($code >= 0xA2EF && $code <= 0xA2F0);
next if ($code >= 0xA2FD && $code <= 0xA2FE);
next if ($code >= 0xA4F4 && $code <= 0xA4FE);
next if ($code >= 0xA5F7 && $code <= 0xA5FE);
next if ($code >= 0xA6B9 && $code <= 0xA6C0);
next if ($code >= 0xA6D9 && $code <= 0xA6FE);
next if ($code >= 0xA7C2 && $code <= 0xA7D0);
next if ($code >= 0xA7F2 && $code <= 0xA7FE);
next if ($code >= 0xA8BB && $code <= 0xA8C4);
next if ($code >= 0xA8EA && $code <= 0xA8FE);
next if ($code >= 0xA9A1 && $code <= 0xA9A3);
next if ($code >= 0xA9F0 && $code <= 0xA9FE);
next if ($code >= 0xD7FA && $code <= 0xD7FE);
# A couple of characters are mapped differently from GB-2312 or GB-18030
if ($code == 0xA1A4)
{
$ucs = 0x30FB;
}
if ($code == 0xA1AA)
{
$ucs = 0x2015;
}
push @mapping, {
ucs => $ucs,
code => $code,
direction => 'both'
}
}
close(FILE);
$file = "euc_cn_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapEUC_CN[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
print FILE "};\n";
close(FILE);
print_tables("EUC_CN", \@mapping);

View File

@ -7,9 +7,7 @@
# Generate UTF-8 <--> EUC_JIS_2004 code conversion tables from
# "euc-jis-2004-std.txt" (http://x0213.org)
require "ucs2utf.pl";
$TEST = 0;
require "convutils.pm";
# first generate UTF-8 --> EUC_JIS_2004 table
@ -17,10 +15,7 @@ $in_file = "euc-jis-2004-std.txt";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
reset 'array1';
reset 'comment';
reset 'comment1';
my @all;
while ($line = <FILE>)
{
@ -31,14 +26,14 @@ while ($line = <FILE>)
$u2 = $3;
$rest = "U+" . $u1 . "+" . $u2 . $4;
$code = hex($c);
$ucs = hex($u1);
$utf1 = &ucs2utf($ucs);
$ucs = hex($u2);
$utf2 = &ucs2utf($ucs);
$str = sprintf "%08x%08x", $utf1, $utf2;
$array1{$str} = $code;
$comment1{$str} = $rest;
$count1++;
$ucs1 = hex($u1);
$ucs2 = hex($u2);
push @all, { direction => 'both',
ucs => $ucs1,
ucs_second => $ucs2,
code => $code,
comment => $rest };
next;
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
@ -54,252 +49,11 @@ while ($line = <FILE>)
$ucs = hex($u);
$code = hex($c);
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = $code;
$comment{$code} = $rest;
next if ($code < 0x80 && $ucs < 0x80);
push @all, { direction => 'both', ucs => $ucs, code => $code, comment => $rest };
}
close(FILE);
$file = "utf8_to_euc_jis_2004.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
print FILE " */\n";
print FILE "static const pg_utf_to_local ULmapEUC_JIS_2004[] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%08x, 0x%06x} /* %s */\n", $index, $code,
$comment{$code};
}
else
{
printf FILE " {0x%08x, 0x%06x}, /* %s */\n", $index, $code,
$comment{$code};
}
}
print FILE "};\n";
close(FILE);
if ($TEST == 1)
{
$file1 = "utf8.data";
$file2 = "euc_jis_2004.data";
open(FILE1, "> $file1") || die("cannot open $file1");
open(FILE2, "> $file2") || die("cannot open $file2");
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
if ( $code > 0x00
&& $code != 0x09
&& $code != 0x0a
&& $code != 0x0d
&& $code != 0x5c
&& ( $code < 0x80
|| ($code >= 0x8ea1 && $code <= 0x8efe)
|| ($code >= 0x8fa1a1 && $code <= 0x8ffefe)
|| ($code >= 0xa1a1 && $code <= 0x8fefe)))
{
for ($i = 3; $i >= 0; $i--)
{
$s = $i * 8;
$mask = 0xff << $s;
print FILE1 pack("C", ($index & $mask) >> $s)
if $index & $mask;
print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask;
}
print FILE1 "\n";
print FILE2 "\n";
}
}
}
$file = "utf8_to_euc_jis_2004_combined.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
print FILE " */\n";
print FILE
"static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {\n";
for $index (sort { $a cmp $b } keys(%array1))
{
$code = $array1{$index};
$count1--;
if ($count1 == 0)
{
printf FILE " {0x%s, 0x%s, 0x%06x} /* %s */\n", substr($index, 0, 8),
substr($index, 8, 8), $code, $comment1{$index};
}
else
{
printf FILE " {0x%s, 0x%s, 0x%06x}, /* %s */\n",
substr($index, 0, 8), substr($index, 8, 8), $code,
$comment1{$index};
}
}
print FILE "};\n";
close(FILE);
if ($TEST == 1)
{
for $index (sort { $a cmp $b } keys(%array1))
{
$code = $array1{$index};
if ( $code > 0x00
&& $code != 0x09
&& $code != 0x0a
&& $code != 0x0d
&& $code != 0x5c
&& ( $code < 0x80
|| ($code >= 0x8ea1 && $code <= 0x8efe)
|| ($code >= 0x8fa1a1 && $code <= 0x8ffefe)
|| ($code >= 0xa1a1 && $code <= 0x8fefe)))
{
$v1 = hex(substr($index, 0, 8));
$v2 = hex(substr($index, 8, 8));
for ($i = 3; $i >= 0; $i--)
{
$s = $i * 8;
$mask = 0xff << $s;
print FILE1 pack("C", ($v1 & $mask) >> $s) if $v1 & $mask;
print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask;
}
for ($i = 3; $i >= 0; $i--)
{
$s = $i * 8;
$mask = 0xff << $s;
print FILE1 pack("C", ($v2 & $mask) >> $s) if $v2 & $mask;
}
print FILE1 "\n";
print FILE2 "\n";
}
}
close(FILE1);
close(FILE2);
}
# then generate EUC_JIS_2004 --> UTF-8 table
$in_file = "euc-jis-2004-std.txt";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
reset 'array1';
reset 'comment';
reset 'comment1';
while ($line = <FILE>)
{
if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
{
$c = $1;
$u1 = $2;
$u2 = $3;
$rest = "U+" . $u1 . "+" . $u2 . $4;
$code = hex($c);
$ucs = hex($u1);
$utf1 = &ucs2utf($ucs);
$ucs = hex($u2);
$utf2 = &ucs2utf($ucs);
$str = sprintf "%08x%08x", $utf1, $utf2;
$array1{$code} = $str;
$comment1{$code} = $rest;
$count1++;
next;
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
{
$c = $1;
$u = $2;
$rest = "U+" . $u . $3;
}
else
{
next;
}
$ucs = hex($u);
$code = hex($c);
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$code} = $utf;
$comment{$utf} = $rest;
}
close(FILE);
$file = "euc_jis_2004_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
print FILE " */\n";
print FILE "static const pg_local_to_utf LUmapEUC_JIS_2004[] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%06x, 0x%08x} /* %s */\n", $index, $code,
$comment{$code};
}
else
{
printf FILE " {0x%06x, 0x%08x}, /* %s */\n", $index, $code,
$comment{$code};
}
}
print FILE "};\n";
close(FILE);
$file = "euc_jis_2004_to_utf8_combined.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
print FILE " */\n";
print FILE
"static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {\n";
for $index (sort { $a <=> $b } keys(%array1))
{
$code = $array1{$index};
$count1--;
if ($count1 == 0)
{
printf FILE " {0x%06x, 0x%s, 0x%s} /* %s */\n", $index,
substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
}
else
{
printf FILE " {0x%06x, 0x%s, 0x%s}, /* %s */\n", $index,
substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
}
}
print FILE "};\n";
close(FILE);
print_tables("EUC_JIS_2004", \@all, 1);

View File

@ -8,275 +8,223 @@
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain JIS0201.TXT, JIS0208.TXT, JIS0212.TXT from
# the organization's ftp site.
#
# JIS0201.TXT format:
# JIS0201 code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)
#
# JIS0208.TXT format:
# JIS0208 shift-JIS code in hex
# JIS0208 code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)
#
# JIS0212.TXT format:
# JIS0212 code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)
# you have to obtain CP932.TXT and JIS0212.TXT from the
# organization's ftp site.
require "ucs2utf.pl";
use strict;
require "convutils.pm";
# first generate UTF-8 --> EUC_JP table
# Load JIS0212.TXT
my $jis0212 = &read_source("JIS0212.TXT");
#
# JIS0201
#
$in_file = "JIS0201.TXT";
my @mapping;
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
while (<FILE>)
{
chop;
if (/^#/)
foreach my $i (@$jis0212) {
# We have a different mapping for this in the EUC_JP to UTF-8 direction.
if ($i->{code} == 0x2243)
{
next;
$i->{direction} = "from_unicode";
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
# add single shift 2
$array{$utf} = ($code | 0x8e00);
if ($i->{code} == 0x2271)
{
$i->{direction} = "to_unicode";
}
}
close(FILE);
#
# JIS0208
#
$in_file = "JIS0208.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
if ($i->{ucs} >= 0x080)
{
next;
}
($s, $c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = ($code | 0x8080);
}
}
close(FILE);
#
# JIS0212
#
$in_file = "JIS0212.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = ($code | 0x8f8080);
}
}
close(FILE);
$file = "utf8_to_euc_jp.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapEUC_JP[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
$i->{code} = $i->{code} | 0x8f8080;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
next;
}
push @mapping, $i;
}
print FILE "};\n";
close(FILE);
# Load CP932.TXT.
my $ct932 = &read_source("CP932.TXT");
#
# then generate EUC_JP --> UTF8 table
#
foreach my $i (@$ct932) {
my $sjis = $i->{code};
#
# JIS0201
#
$in_file = "JIS0201.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
while (<FILE>)
{
chop;
if (/^#/)
# We have a different mapping for this in the EUC_JP to UTF-8 direction.
if ($sjis == 0xeefa ||
$sjis == 0xeefb ||
$sjis == 0xeefc)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
# add single shift 2
$code |= 0x8e00;
$array{$code} = $utf;
if ($sjis >= 0xa1)
{
my $jis = &sjis2jis($sjis);
$i->{code} = $jis | ($jis < 0x100 ? 0x8e00 :
($sjis >= 0xeffd ? 0x8f8080 : 0x8080));
# Remember the SJIS code for later.
$i->{sjis} = $sjis;
push @mapping, $i;
}
}
close(FILE);
#
# JIS0208
#
$in_file = "JIS0208.TXT";
foreach my $i (@mapping) {
my $sjis = $i->{sjis};
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
# These SJIS characters are excluded completely.
if ($sjis >= 0xed00 && $sjis <= 0xeef9 ||
$sjis >= 0xfa54 && $sjis <= 0xfa56 ||
$sjis >= 0xfa58 && $sjis <= 0xfc4b)
{
$i->{direction} = "none";
next;
}
($s, $c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
$code |= 0x8080;
$array{$code} = $utf;
}
}
close(FILE);
#
# JIS0212
#
$in_file = "JIS0212.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
# These SJIS characters are only in the UTF-8 to EUC_JP table
if ($sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc)
{
$i->{direction} = "from_unicode";
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
$code |= 0x8f8080;
$array{$code} = $utf;
if ($sjis == 0x8790 || $sjis == 0x8791 || $sjis == 0x8792 ||
$sjis == 0x8795 || $sjis == 0x8796 || $sjis == 0x8797 ||
$sjis == 0x879a || $sjis == 0x879b || $sjis == 0x879c ||
($sjis >= 0xfa4a && $sjis <= 0xfa53))
{
$i->{direction} = "to_unicode";
next;
}
}
close(FILE);
$file = "euc_jp_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
push @mapping, (
{direction => 'both', ucs => 0x4efc, code => 0x8ff4af, comment => '# CJK(4EFC)'},
{direction => 'both', ucs => 0x50f4, code => 0x8ff4b0, comment => '# CJK(50F4)'},
{direction => 'both', ucs => 0x51EC, code => 0x8ff4b1, comment => '# CJK(51EC)'},
{direction => 'both', ucs => 0x5307, code => 0x8ff4b2, comment => '# CJK(5307)'},
{direction => 'both', ucs => 0x5324, code => 0x8ff4b3, comment => '# CJK(5324)'},
{direction => 'both', ucs => 0x548A, code => 0x8ff4b5, comment => '# CJK(548A)'},
{direction => 'both', ucs => 0x5759, code => 0x8ff4b6, comment => '# CJK(5759)'},
{direction => 'both', ucs => 0x589E, code => 0x8ff4b9, comment => '# CJK(589E)'},
{direction => 'both', ucs => 0x5BEC, code => 0x8ff4ba, comment => '# CJK(5BEC)'},
{direction => 'both', ucs => 0x5CF5, code => 0x8ff4bb, comment => '# CJK(5CF5)'},
{direction => 'both', ucs => 0x5D53, code => 0x8ff4bc, comment => '# CJK(5D53)'},
{direction => 'both', ucs => 0x5FB7, code => 0x8ff4be, comment => '# CJK(5FB7)'},
{direction => 'both', ucs => 0x6085, code => 0x8ff4bf, comment => '# CJK(6085)'},
{direction => 'both', ucs => 0x6120, code => 0x8ff4c0, comment => '# CJK(6120)'},
{direction => 'both', ucs => 0x654E, code => 0x8ff4c1, comment => '# CJK(654E)'},
{direction => 'both', ucs => 0x663B, code => 0x8ff4c2, comment => '# CJK(663B)'},
{direction => 'both', ucs => 0x6665, code => 0x8ff4c3, comment => '# CJK(6665)'},
{direction => 'both', ucs => 0x6801, code => 0x8ff4c6, comment => '# CJK(6801)'},
{direction => 'both', ucs => 0x6A6B, code => 0x8ff4c9, comment => '# CJK(6A6B)'},
{direction => 'both', ucs => 0x6AE2, code => 0x8ff4ca, comment => '# CJK(6AE2)'},
{direction => 'both', ucs => 0x6DF2, code => 0x8ff4cc, comment => '# CJK(6DF2)'},
{direction => 'both', ucs => 0x6DF8, code => 0x8ff4cb, comment => '# CJK(6DF8)'},
{direction => 'both', ucs => 0x7028, code => 0x8ff4cd, comment => '# CJK(7028)'},
{direction => 'both', ucs => 0x70BB, code => 0x8ff4ae, comment => '# CJK(70BB)'},
{direction => 'both', ucs => 0x7501, code => 0x8ff4d0, comment => '# CJK(7501)'},
{direction => 'both', ucs => 0x7682, code => 0x8ff4d1, comment => '# CJK(7682)'},
{direction => 'both', ucs => 0x769E, code => 0x8ff4d2, comment => '# CJK(769E)'},
{direction => 'both', ucs => 0x7930, code => 0x8ff4d4, comment => '# CJK(7930)'},
{direction => 'both', ucs => 0x7AE7, code => 0x8ff4d9, comment => '# CJK(7AE7)'},
{direction => 'both', ucs => 0x7DA0, code => 0x8ff4dc, comment => '# CJK(7DA0)'},
{direction => 'both', ucs => 0x7DD6, code => 0x8ff4dd, comment => '# CJK(7DD6)'},
{direction => 'both', ucs => 0x8362, code => 0x8ff4df, comment => '# CJK(8362)'},
{direction => 'both', ucs => 0x85B0, code => 0x8ff4e1, comment => '# CJK(85B0)'},
{direction => 'both', ucs => 0x8807, code => 0x8ff4e4, comment => '# CJK(8807)'},
{direction => 'both', ucs => 0x8B7F, code => 0x8ff4e6, comment => '# CJK(8B7F)'},
{direction => 'both', ucs => 0x8CF4, code => 0x8ff4e7, comment => '# CJK(8CF4)'},
{direction => 'both', ucs => 0x8D76, code => 0x8ff4e8, comment => '# CJK(8D76)'},
{direction => 'both', ucs => 0x90DE, code => 0x8ff4ec, comment => '# CJK(90DE)'},
{direction => 'both', ucs => 0x9115, code => 0x8ff4ee, comment => '# CJK(9115)'},
{direction => 'both', ucs => 0x9592, code => 0x8ff4f1, comment => '# CJK(9592)'},
{direction => 'both', ucs => 0x973B, code => 0x8ff4f4, comment => '# CJK(973B)'},
{direction => 'both', ucs => 0x974D, code => 0x8ff4f5, comment => '# CJK(974D)'},
{direction => 'both', ucs => 0x9751, code => 0x8ff4f6, comment => '# CJK(9751)'},
{direction => 'both', ucs => 0x999E, code => 0x8ff4fa, comment => '# CJK(999E)'},
{direction => 'both', ucs => 0x9AD9, code => 0x8ff4fb, comment => '# CJK(9AD9)'},
{direction => 'both', ucs => 0x9B72, code => 0x8ff4fc, comment => '# CJK(9B72)'},
{direction => 'both', ucs => 0x9ED1, code => 0x8ff4fe, comment => '# CJK(9ED1)'},
{direction => 'both', ucs => 0xF929, code => 0x8ff4c5, comment => '# CJK COMPATIBILITY IDEOGRAPH-F929'},
{direction => 'both', ucs => 0xF9DC, code => 0x8ff4f2, comment => '# CJK COMPATIBILITY IDEOGRAPH-F9DC'},
{direction => 'both', ucs => 0xFA0E, code => 0x8ff4b4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0E'},
{direction => 'both', ucs => 0xFA0F, code => 0x8ff4b7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0F'},
{direction => 'both', ucs => 0xFA10, code => 0x8ff4b8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA10'},
{direction => 'both', ucs => 0xFA11, code => 0x8ff4bd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA11'},
{direction => 'both', ucs => 0xFA12, code => 0x8ff4c4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA12'},
{direction => 'both', ucs => 0xFA13, code => 0x8ff4c7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA13'},
{direction => 'both', ucs => 0xFA14, code => 0x8ff4c8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA14'},
{direction => 'both', ucs => 0xFA15, code => 0x8ff4ce, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA15'},
{direction => 'both', ucs => 0xFA16, code => 0x8ff4cf, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA16'},
{direction => 'both', ucs => 0xFA17, code => 0x8ff4d3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA17'},
{direction => 'both', ucs => 0xFA18, code => 0x8ff4d5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA18'},
{direction => 'both', ucs => 0xFA19, code => 0x8ff4d6, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA19'},
{direction => 'both', ucs => 0xFA1A, code => 0x8ff4d7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1A'},
{direction => 'both', ucs => 0xFA1B, code => 0x8ff4d8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1B'},
{direction => 'both', ucs => 0xFA1C, code => 0x8ff4da, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1C'},
{direction => 'both', ucs => 0xFA1D, code => 0x8ff4db, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1D'},
{direction => 'both', ucs => 0xFA1E, code => 0x8ff4de, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1E'},
{direction => 'both', ucs => 0xFA1F, code => 0x8ff4e0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1F'},
{direction => 'both', ucs => 0xFA20, code => 0x8ff4e2, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA20'},
{direction => 'both', ucs => 0xFA21, code => 0x8ff4e3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA21'},
{direction => 'both', ucs => 0xFA22, code => 0x8ff4e5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA22'},
{direction => 'both', ucs => 0xFA23, code => 0x8ff4e9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA23'},
{direction => 'both', ucs => 0xFA24, code => 0x8ff4ea, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA24'},
{direction => 'both', ucs => 0xFA25, code => 0x8ff4eb, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA25'},
{direction => 'both', ucs => 0xFA26, code => 0x8ff4ed, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA26'},
{direction => 'both', ucs => 0xFA27, code => 0x8ff4ef, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA27'},
{direction => 'both', ucs => 0xFA28, code => 0x8ff4f0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA28'},
{direction => 'both', ucs => 0xFA29, code => 0x8ff4f3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA29'},
{direction => 'both', ucs => 0xFA2A, code => 0x8ff4f7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2A'},
{direction => 'both', ucs => 0xFA2B, code => 0x8ff4f8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2B'},
{direction => 'both', ucs => 0xFA2C, code => 0x8ff4f9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2C'},
{direction => 'both', ucs => 0xFA2D, code => 0x8ff4fd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2D'},
{direction => 'both', ucs => 0xFF07, code => 0x8ff4a9, comment => '# FULLWIDTH APOSTROPHE'},
{direction => 'both', ucs => 0xFFE4, code => 0x8fa2c3, comment => '# FULLWIDTH BROKEN BAR'},
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapEUC_JP[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
# additional conversions for EUC_JP -> UTF-8 conversion
{direction => 'to_unicode', ucs => 0x2116, code => 0x8ff4ac, comment => '# NUMERO SIGN'},
{direction => 'to_unicode', ucs => 0x2121, code => 0x8ff4ad, comment => '# TELEPHONE SIGN'},
{direction => 'to_unicode', ucs => 0x3231, code => 0x8ff4ab, comment => '# PARENTHESIZED IDEOGRAPH STOCK'}
);
print_tables("EUC_JP", \@mapping);
#######################################################################
# sjis2jis ; SJIS => JIS conversion
# Convert a Shift-JIS (SJIS) code to the corresponding JIS code.
#
# Takes one numeric argument, the SJIS code. Values up to and including
# 0x100 are returned unchanged. Anything larger is split into a high and a
# low byte, turned into a linear position, and re-encoded as a two-byte
# value with 0x21 added to each byte.
#
# NOTE(review): this text is a flattened diff, so the block still carries
# lines that look like the removed side of the hunk (the old table-printing
# loop and the FILE print/close) -- confirm against the committed script
# before relying on them.
sub sjis2jis
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
my ($sjis) = @_;
print FILE "};\n";
close(FILE);
# Codes up to and including 0x100 pass through untouched.
return $sjis if ($sjis <= 0x100);
# Split the code into its high and low bytes.
my $hi = $sjis >> 8;
my $lo = $sjis & 0xff;
# Low bytes from 0x80 up are moved down by one, then the low byte is
# rebased by 0x40 (presumably closing the gap at 0x7f, which is not a
# valid SJIS trail byte -- confirm).
if ($lo >= 0x80) { $lo--; }
$lo -= 0x40;
# High bytes from 0xe0 up are moved down by 0x40, then the high byte is
# rebased by 0x81 (presumably skipping the unused lead-byte range --
# confirm).
if ($hi >= 0xe0) { $hi -= 0x40; }
$hi -= 0x81;
# Linear position: each high-byte step covers 0xbc low-byte positions.
my $pos = $lo + $hi * 0xbc;
if ($pos >= 114 * 0x5e && $pos <= 115 * 0x5e + 0x1b)
{
# This region (115-ku) is outside the range of JIS codes, but to make
# it convenient to generate codes in EUC CODESET 3, move it to the
# seemingly duplicate region (83-84-ku).
$pos = $pos - ((31 * 0x5e) + 12);
# Positions from 85-ku 82-ten onward need to be moved down by 2 more
# code points.
# NOTE(review): this threshold multiplies by 0x5c while the rest of the
# function uses a row width of 0x5e -- confirm this is intended.
$pos = $pos - 2 if ($pos >= 84 * 0x5c + 82)
}
# Row and column of the position, in rows of 0x5e positions each.
# NOTE(review): "/" is not an integer division unless "use integer" is in
# effect; the "<<" below presumably discards any fractional part -- confirm.
my $hi2 = $pos / 0x5e;
my $lo2 = ($pos % 0x5e);
# Re-encode: add 0x21 to both the row and the column, row in the high byte.
my $ret = $lo2 + 0x21 + (($hi2 + 0x21) << 8);
return $ret;
}

View File

@ -16,113 +16,22 @@
# UCS-2 code in hex
# # and Unicode name (not used in this script)
require "ucs2utf.pl";
require "convutils.pm";
# first generate UTF-8 --> EUC_KR table
# Load the source file.
$in_file = "KSX1001.TXT";
my $mapping = &read_source("KSX1001.TXT");
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
foreach my $i (@$mapping)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = ($code | 0x8080);
}
}
close(FILE);
$file = "utf8_to_euc_kr.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapEUC_KR[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
$i->{code} = $i->{code} | 0x8080;
}
print FILE "};\n";
close(FILE);
# Some extra characters that are not in KSX1001.TXT
push @$mapping, (
{direction => 'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN'},
{direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN'},
{direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U'}
);
#
# then generate EUC_KR --> UTF8 table
#
reset 'array';
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
$code |= 0x8080;
$array{$code} = $utf;
}
}
close(FILE);
$file = "euc_kr_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapEUC_KR[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
print FILE "};\n";
close(FILE);
print_tables("EUC_KR", $mapping);

View File

@ -17,141 +17,47 @@
# UCS-2 code in hex
# # and Unicode name (not used in this script)
require "ucs2utf.pl";
require "convutils.pm";
# first generate UTF-8 --> EUC_TW table
my $mapping = &read_source("CNS11643.TXT");
$in_file = "CNS11643.TXT";
my @extras;
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
foreach my $i (@$mapping)
{
chop;
if (/^#/)
my $ucs = $i->{ucs};
my $code = $i->{code};
my $origcode = $i->{code};
my $plane = ($code & 0x1f0000) >> 16;
if ($plane > 16)
{
printf STDERR "Warning: invalid plane No.$plane. ignored\n";
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
if ($plane == 1)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$plane = ($code & 0x1f0000) >> 16;
if ($plane > 16)
{
printf STDERR "Warning: invalid plane No.$plane. ignored\n";
next;
}
if ($plane == 1)
{
$array{$utf} = (($code & 0xffff) | 0x8080);
}
else
{
$array{$utf} =
(0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
}
}
}
close(FILE);
$file = "utf8_to_euc_tw.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapEUC_TW[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
$code = ($code & 0xffff) | 0x8080;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
$code = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
}
$i->{code} = $code;
# Some codes are mapped twice in the EUC_TW to UTF-8 table.
if ($origcode >= 0x12121 && $origcode <= 0x20000)
{
push @extras, {
ucs => $i->{ucs},
code => ($i->{code} + 0x8ea10000),
rest => $i->{rest},
direction => 'to_unicode'
}
}
}
print FILE "};\n";
close(FILE);
push @$mapping, @extras;
#
# then generate EUC_TW --> UTF8 table
#
reset 'array';
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
$plane = ($code & 0x1f0000) >> 16;
if ($plane > 16)
{
printf STDERR "Warning: invalid plane No.$plane. ignored\n";
next;
}
if ($plane == 1)
{
$c = (($code & 0xffff) | 0x8080);
$array{$c} = $utf;
$count++;
}
$c = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
$array{$c} = $utf;
}
}
close(FILE);
$file = "euc_tw_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapEUC_TW[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
print FILE "};\n";
close(FILE);
print_tables("EUC_TW", $mapping);

View File

@ -13,8 +13,7 @@
# where the "u" field is the Unicode code point in hex,
# and the "b" field is the hex byte sequence for GB18030
require "ucs2utf.pl";
require "convutils.pm";
# Read the input
@ -22,6 +21,8 @@ $in_file = "gb-18030-2000.xml";
open(FILE, $in_file) || die("cannot open $in_file");
my @mapping;
while (<FILE>)
{
next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
@ -32,78 +33,13 @@ while (<FILE>)
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($arrayu{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
push @mapping, {
ucs => $ucs,
code => $code,
direction => 'both'
}
if ($arrayc{$code} ne "")
{
printf STDERR "Warning: duplicate GB18030: %08x\n", $code;
next;
}
$arrayu{$utf} = $code;
$arrayc{$code} = $utf;
$count++;
}
}
close(FILE);
#
# first, generate UTF8 --> GB18030 table
#
$file = "utf8_to_gb18030.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapGB18030[ $count ] = {\n";
$cc = $count;
for $index (sort { $a <=> $b } keys(%arrayu))
{
$code = $arrayu{$index};
$cc--;
if ($cc == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
}
print FILE "};\n";
close(FILE);
#
# then generate GB18030 --> UTF8 table
#
$file = "gb18030_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n";
$cc = $count;
for $index (sort { $a <=> $b } keys(%arrayc))
{
$utf = $arrayc{$index};
$cc--;
if ($cc == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
print FILE "};\n";
close(FILE);
print_tables("GB18030", \@mapping);

View File

@ -0,0 +1,31 @@
#! /usr/bin/perl
#
# Copyright (c) 2001-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl
#
# Generate UTF-8 <--> JOHAB conversion tables from
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain the map files from the organization's ftp site.
# ftp://www.unicode.org/Public/MAPPINGS/
# We assume the file include three tab-separated columns:
# JOHAB code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)
use strict;
use warnings;

# convutils.pm sits in the same directory as this script.  Since Perl 5.26
# the current directory is no longer in @INC, so add the script's own
# directory explicitly before loading it.
use FindBin;
use lib $FindBin::RealBin;
require "convutils.pm";

# Load the source file.  read_source() returns a reference to a list of
# mapping hashes (code, ucs, comment, direction, ...).
my $mapping = read_source("JOHAB.TXT");

# Some extra characters that are not in JOHAB.TXT
push @$mapping, (
	{direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN'},
	{direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN'},
	{direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U'}
);

# Write johab_to_utf8.map and utf8_to_johab.map.
print_tables("JOHAB", $mapping);

View File

@ -7,7 +7,7 @@
# Generate UTF-8 <--> SHIFT_JIS_2004 code conversion tables from
# "sjis-0213-2004-std.txt" (http://x0213.org)
require "ucs2utf.pl";
require "convutils.pm";
# first generate UTF-8 --> SHIFT_JIS_2004 table
@ -15,10 +15,7 @@ $in_file = "sjis-0213-2004-std.txt";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
reset 'array1';
reset 'comment';
reset 'comment1';
my @mapping;
while ($line = <FILE>)
{
@ -29,14 +26,16 @@ while ($line = <FILE>)
$u2 = $3;
$rest = "U+" . $u1 . "+" . $u2 . $4;
$code = hex($c);
$ucs = hex($u1);
$utf1 = &ucs2utf($ucs);
$ucs = hex($u2);
$utf2 = &ucs2utf($ucs);
$str = sprintf "%08x%08x", $utf1, $utf2;
$array1{$str} = $code;
$comment1{$str} = $rest;
$count1++;
$ucs1 = hex($u1);
$ucs2 = hex($u2);
push @mapping, {
code => $code,
ucs => $ucs1,
ucs_second => $ucs2,
comment => $rest,
direction => 'both'
};
next;
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
@ -52,183 +51,31 @@ while ($line = <FILE>)
$ucs = hex($u);
$code = hex($c);
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR
"Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf,
$ucs, $code;
next;
}
$count++;
$array{$utf} = $code;
$comment{$code} = $rest;
}
close(FILE);
$file = "utf8_to_shift_jis_2004.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
print FILE " */\n";
print FILE "static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%08x, 0x%06x} /* %s */\n", $index, $code,
$comment{$code};
}
else
{
printf FILE " {0x%08x, 0x%06x}, /* %s */\n", $index, $code,
$comment{$code};
}
}
print FILE "};\n";
close(FILE);
$file = "utf8_to_shift_jis_2004_combined.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
print FILE " */\n";
print FILE
"static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = {\n";
for $index (sort { $a cmp $b } keys(%array1))
{
$code = $array1{$index};
$count1--;
if ($count1 == 0)
{
printf FILE " {0x%s, 0x%s, 0x%04x} /* %s */\n", substr($index, 0, 8),
substr($index, 8, 8), $code, $comment1{$index};
}
else
{
printf FILE " {0x%s, 0x%s, 0x%04x}, /* %s */\n",
substr($index, 0, 8), substr($index, 8, 8), $code,
$comment1{$index};
}
}
print FILE "};\n";
close(FILE);
# then generate SHIFT_JIS_2004 --> UTF-8 table
$in_file = "sjis-0213-2004-std.txt";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
reset 'array1';
reset 'comment';
reset 'comment1';
while ($line = <FILE>)
{
if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
{
$c = $1;
$u1 = $2;
$u2 = $3;
$rest = "U+" . $u1 . "+" . $u2 . $4;
$code = hex($c);
$ucs = hex($u1);
$utf1 = &ucs2utf($ucs);
$ucs = hex($u2);
$utf2 = &ucs2utf($ucs);
$str = sprintf "%08x%08x", $utf1, $utf2;
$array1{$code} = $str;
$comment1{$code} = $rest;
$count1++;
next;
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
{
$c = $1;
$u = $2;
$rest = "U+" . $u . $3;
}
else
if ($code < 0x80 && $ucs < 0x80)
{
next;
}
$ucs = hex($u);
$code = hex($c);
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
elsif ($code < 0x80)
{
printf STDERR
"Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf,
$ucs, $code;
printf STDERR "Previous value: UTF8: %08x\n", $array{$utf};
next;
$direction = 'from_unicode';
}
$count++;
$array{$code} = $utf;
$comment{$utf} = $rest;
}
close(FILE);
$file = "shift_jis_2004_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_SHIFTJIS_2004.pl\n";
print FILE " */\n";
print FILE "static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
elsif ($ucs < 0x80)
{
printf FILE " {0x%04x, 0x%08x} /* %s */\n", $index, $code,
$comment{$code};
$direction = 'to_unicode';
}
else
{
printf FILE " {0x%04x, 0x%08x}, /* %s */\n", $index, $code,
$comment{$code};
$direction = 'both';
}
}
print FILE "};\n";
push @mapping, {
code => $code,
ucs => $ucs,
comment => $rest,
direction => $direction
};
}
close(FILE);
$file = "shift_jis_2004_to_utf8_combined.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
print FILE " */\n";
print FILE
"static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = {\n";
for $index (sort { $a <=> $b } keys(%array1))
{
$code = $array1{$index};
$count1--;
if ($count1 == 0)
{
printf FILE " {0x%04x, 0x%s, 0x%s} /* %s */\n", $index,
substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
}
else
{
printf FILE " {0x%04x, 0x%s, 0x%s}, /* %s */\n", $index,
substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
}
}
print FILE "};\n";
close(FILE);
print_tables("SHIFT_JIS_2004", \@mapping, 1);

View File

@ -4,138 +4,45 @@
#
# src/backend/utils/mb/Unicode/UCS_to_SJIS.pl
#
# Generate UTF-8 <--> SJIS code conversion tables from
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain SHIFTJIS.TXT from
# the organization's ftp site.
#
# SHIFTJIS.TXT format:
# SHIFTJIS code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)
# Warning: SHIFTJIS.TXT contains only JIS0201 and JIS0208. no JIS0212.
# Generate UTF-8 <=> SJIS code conversion tables from CP932.TXT,
# a map file provided by the Unicode organization.
# Unfortunately it is prohibited by the organization to distribute
# the map files. So if you try to use this script, you have to
# obtain CP932.TXT from the organization's ftp site.
require "ucs2utf.pl";
use strict;
require "convutils.pm";
# first generate UTF-8 --> SJIS table
my $charset = read_source("CP932.TXT");
$in_file = "CP932.TXT";
$count = 0;
# Drop these SJIS codes from the source for UTF8=>SJIS conversion
my @reject_sjis =(
0xed40..0xeefc, 0x8754..0x875d, 0x878a, 0x8782,
0x8784, 0xfa5b, 0xfa54, 0x8790..0x8792, 0x8795..0x8797,
0x879a..0x879c
);
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
foreach my $i (@$charset)
{
chop;
if (/^#/)
my $code = $i->{code};
my $ucs = $i->{ucs};
if (grep {$code == $_} @reject_sjis)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ((($code >= 0xed40) && ($code <= 0xeefc))
|| ( ($code >= 0x8754)
&& ($code <= 0x875d))
|| ($code == 0x878a)
|| ($code == 0x8782)
|| ($code == 0x8784)
|| ($code == 0xfa5b)
|| ($code == 0xfa54)
|| ( ($code >= 0x8790)
&& ($code <= 0x8792))
|| ( ($code >= 0x8795)
&& ($code <= 0x8797))
|| ( ($code >= 0x879a)
&& ($code <= 0x879c)))
{
printf STDERR "Warning: duplicate UTF8: UCS=0x%04x SJIS=0x%04x\n",
$ucs,
$code;
next;
}
$count++;
$array{$utf} = $code;
$i->{direction} = "to_unicode";
}
}
close(FILE);
# Add these UTF8->SJIS pairs to the table.
push @$charset, (
{direction => "from_unicode", ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN'},
{direction => "from_unicode", ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN'},
{direction => "from_unicode", ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN'},
{direction => "from_unicode", ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN'},
{direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE'},
{direction => "from_unicode", ucs => 0x203e, code => 0x7e, comment => '# OVERLINE'},
{direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN'},
{direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH'}
);
$file = "utf8_to_sjis.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapSJIS[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
}
print FILE "};\n";
close(FILE);
#
# then generate SJIS --> UTF8 table
#
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
$count = 0;
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
$count++;
$array{$code} = $utf;
}
}
close(FILE);
$file = "sjis_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapSJIS[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
print FILE "};\n";
close(FILE);
print_tables("SJIS", $charset);

View File

@ -0,0 +1,51 @@
#! /usr/bin/perl
#
# Copyright (c) 2007-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/UCS_to_UHC.pl
#
# Generate UTF-8 <--> UHC code conversion tables from
# "windows-949-2000.xml", obtained from
# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
#
# The lines we care about in the source file look like
# <a u="AC02" b="81 41"/>
# where the "u" field is the Unicode code point in hex,
# and the "b" field is the hex byte sequence for UHC
use strict;
use warnings;

# convutils.pm sits in the same directory as this script.  Since Perl 5.26
# the current directory is no longer in @INC, so add the script's own
# directory explicitly before loading it.
use FindBin;
use lib $FindBin::RealBin;
require "convutils.pm";

# Read the input

my $in_file = "windows-949-2000.xml";

open(my $in, '<', $in_file) || die("cannot open $in_file: $!");

my @mapping;

while (my $line = <$in>)
{
	# Only <a u="..." b="..."/> assignment lines carry a mapping.
	next if ($line !~ m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
	my ($u, $c) = ($1, $2);

	# The byte sequence is written as space-separated hex bytes; join them
	# into a single hex number.
	$c =~ s/ //g;
	my $ucs  = hex($u);
	my $code = hex($c);

	# These two single-byte codes are deliberately left out of the tables.
	next if ($code == 0x0080 || $code == 0x00FF);

	# Skip mappings where either side is plain ASCII.
	if ($code >= 0x80 && $ucs >= 0x0080)
	{
		push @mapping, {
			ucs       => $ucs,
			code      => $code,
			direction => 'both'
		};
	}
}
close($in);

# One extra character that's not in the source file.
push @mapping, { direction => 'both', code => 0xa2e8, ucs => 0x327e, comment => 'CIRCLED HANGUL IEUNG U' };

# Write uhc_to_utf8.map and utf8_to_uhc.map.
print_tables("UHC", \@mapping);

View File

@ -15,7 +15,7 @@
# UCS-2 code in hex
# # and Unicode name (not used in this script)
require "ucs2utf.pl";
require "convutils.pm";
%filename = (
'WIN866' => 'CP866.TXT',
@ -44,121 +44,13 @@ require "ucs2utf.pl";
'ISO8859_16' => '8859-16.TXT',
'KOI8R' => 'KOI8-R.TXT',
'KOI8U' => 'KOI8-U.TXT',
'GBK' => 'CP936.TXT',
'UHC' => 'CP949.TXT',
'JOHAB' => 'JOHAB.TXT',);
'GBK' => 'CP936.TXT');
@charsets = keys(%filename);
@charsets = @ARGV if scalar(@ARGV);
foreach $charset (@charsets)
{
my $mapping = &read_source($filename{$charset});
#
# first, generate UTF8-> charset table
#
$in_file = $filename{$charset};
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = $code;
}
}
close(FILE);
$file = lc("utf8_to_${charset}.map");
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmap${charset}[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
}
print FILE "};\n";
close(FILE);
#
# then generate character set code ->UTF8 table
#
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$code} = $utf;
}
}
close(FILE);
$file = lc("${charset}_to_utf8.map");
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmap${charset}[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
print FILE "};\n";
close(FILE);
print_tables($charset, $mapping);
}

View File

@ -0,0 +1,282 @@
#
# Copyright (c) 2001-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/convutils.pm
use strict;
#######################################################################
# convert UCS-4 to UTF-8
#
sub ucs2utf
{
	# Pack the UTF-8 encoding of code point $ucs into a single integer,
	# first byte in the most significant position (e.g. U+20AC -> 0xe282ac).
	# Values up to 0x7f are returned unchanged.
	my ($ucs) = @_;

	# One byte: ASCII maps to itself.
	return $ucs if ($ucs <= 0x007f);

	# Every multi-byte form ends with a continuation byte holding the
	# low six bits.
	my $last = ($ucs & 0x003f) | 0x80;

	# Two bytes: 110xxxxx 10xxxxxx
	if ($ucs <= 0x07ff)
	{
		return $last | ((($ucs >> 6) | 0xc0) << 8);
	}

	# The three- and four-byte forms share the continuation byte that
	# carries bits 6..11.
	my $mid = ((($ucs & 0x0fc0) >> 6) | 0x80) << 8;

	# Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
	if ($ucs <= 0xffff)
	{
		return ((($ucs >> 12) | 0xe0) << 16) | $mid | $last;
	}

	# Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	return ((($ucs >> 18) | 0xf0) << 24) |
	  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
	  $mid | $last;
}
#######################################################################
# read_source - common routine to read source file
#
# fname ; input file name
sub read_source
{
	# Read a mapping source file and return a reference to a list of hashes,
	# one per mapping line, with the fields:
	#   code      - first hex column, as an integer
	#   ucs       - second hex column, as an integer
	#   comment   - trailing "# ..." text
	#   direction - always "both"
	#   f, l      - source file name and line number
	#
	# Lines starting with '#', empty lines, and lines that have a code but no
	# second column (only a comment) are skipped.  Mappings where either side
	# is below 0x80 are dropped.  Dies on any other line that cannot be parsed.
	my ($fname) = @_;
	my @r;

	open(my $in, '<', $fname) || die("cannot open $fname: $!");

	# Use a lexical line variable so the caller's $_ is left alone.
	while (my $line = <$in>)
	{
		next if ($line =~ /^#/);

		# Strip only the line terminator.  chop() would eat the last real
		# character of a final line that has no trailing newline.
		chomp($line);
		$line =~ s/\r\z//;

		next if ($line =~ /^$/);    # Ignore empty lines

		# Code with no Unicode column: an undefined mapping.
		next if ($line =~ /^0x([0-9A-F]+)\s+(#.*)$/);

		# Two hex columns, an optional third hex column, then the comment.
		# NOTE(review): the original comment here read "Skip the first
		# column for JIS0208.TXT", but the code takes columns one and two
		# and ignores the optional third - confirm which is intended.
		if ($line !~ /^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/)
		{
			# die (rather than a bare exit) so the failure is reported
			# with a non-zero exit status.
			die "READ ERROR at line $. in $fname: $line\n";
		}
		my $out = {f => $fname, l => $.,
				   code => hex($1),
				   ucs => hex($2),
				   comment => $4,
				   direction => "both"
				  };

		# Ignore pure ASCII mappings. PostgreSQL character conversion code
		# never even passes these to the conversion code.
		next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);

		push(@r, $out);
	}
	close($in);

	return \@r;
}
##################################################################
# print_tables : output mapping tables
#
# Arguments:
# charset - string name of the character set.
# table - mapping table (see format below)
# verbose - if 1, output comment on each line,
# if 2, also output source file name and number
#
#
#
# Mapping table format:
#
# Mapping table is a list of hashes. Each hash has the following fields:
# direction - Direction: 'both', 'from_unicode' or 'to_unicode'
# ucs - Unicode code point
# ucs_second - Second Unicode code point, if this is a "combined" character.
# code - Byte sequence in the "other" character set, as an integer
# comment - Text representation of the character
# f - Source filename
# l - Line number in source file
#
#
sub print_tables
{
	# Split the mapping table by direction and by whether an entry is a
	# "combined" (two code point) character, convert the code points to
	# UTF-8, and write out each resulting map file.
	my ($charset, $table, $verbose) = @_;

	my (@to_unicode, @to_unicode_combined);
	my (@from_unicode, @from_unicode_combined);

	foreach my $i (@$table)
	{
		my $combined  = defined $i->{ucs_second};
		my $direction = $i->{direction};

		# Output entry: same fields as the input, with UTF-8 values in
		# place of the code points.
		my $entry = {utf8 => ucs2utf($i->{ucs}),
					 code => $i->{code},
					 comment => $i->{comment},
					 f => $i->{f}, l => $i->{l}};
		$entry->{utf8_second} = ucs2utf($i->{ucs_second}) if ($combined);

		# A 'both' entry goes into the tables for both directions.
		if ($direction eq "both" || $direction eq "to_unicode")
		{
			my $dest = $combined ? \@to_unicode_combined : \@to_unicode;
			push @$dest, $entry;
		}
		if ($direction eq "both" || $direction eq "from_unicode")
		{
			my $dest = $combined ? \@from_unicode_combined : \@from_unicode;
			push @$dest, $entry;
		}
	}

	# The plain maps are always written; the combined ones only when there
	# is at least one combined entry.
	print_to_utf8_map($charset, \@to_unicode, $verbose);
	if (scalar @to_unicode_combined > 0)
	{
		print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose);
	}
	print_from_utf8_map($charset, \@from_unicode, $verbose);
	if (scalar @from_unicode_combined > 0)
	{
		print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose);
	}
}
sub print_from_utf8_map
{
	# Write utf8_to_<charset>.map (lower-cased) in the current directory:
	# a C array of {utf8, code} pairs sorted by UTF-8 value.
	#
	#   charset - character set name, used in the file and array names
	#   table   - ref to a list of hashes with utf8, code, comment, f, l
	#   verbose - if true, append each entry's comment; if >= 2, prefix the
	#             comment with "file:line"
	#
	# Dies if the file cannot be opened or if writing it fails.
	my ($charset, $table, $verbose) = @_;
	$verbose = 0 unless defined $verbose;

	my $last_comment = "";

	my $fname = lc("utf8_to_${charset}.map");
	print "- Writing UTF8=>${charset} conversion table: $fname\n";
	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";

	# Pass the names as printf arguments rather than interpolating them
	# into the format string.
	printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
		   "static const pg_utf_to_local ULmap%s[ %d ] = {",
		   $fname, $charset, scalar(@$table));

	# Each entry's comment is emitted at the start of the next iteration
	# (or after the loop), so that it lands after the separating comma.
	my $first = 1;
	foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table)
	{
		print($out ",") if (!$first);
		$first = 0;
		print($out "\t/* $last_comment */") if ($verbose);

		printf($out "\n {0x%04x, 0x%04x}", $$i{utf8}, $$i{code});
		if ($verbose >= 2)
		{
			$last_comment = "$$i{f}:$$i{l} $$i{comment}";
		}
		else
		{
			$last_comment = $$i{comment};
		}
	}
	print($out "\t/* $last_comment */") if ($verbose);

	print $out "\n};\n";

	# Buffered write errors only surface at close, so check it.
	close($out) || die "cannot close output file : $fname: $!\n";
}
sub print_from_utf8_combined_map
{
	# Write utf8_to_<charset>_combined.map (lower-cased) in the current
	# directory: a C array of {utf8, utf8_second, code} triples sorted by
	# the UTF-8 pair.
	#
	#   charset - character set name, used in the file and array names
	#   table   - ref to a list of hashes with utf8, utf8_second, code,
	#             comment, f, l
	#   verbose - if true, append each entry's comment; if >= 2, prefix the
	#             comment with "file:line", as print_from_utf8_map does
	#
	# Dies if the file cannot be opened or if writing it fails.
	my ($charset, $table, $verbose) = @_;
	$verbose = 0 unless defined $verbose;

	my $last_comment = "";

	my $fname = lc("utf8_to_${charset}_combined.map");
	print "- Writing UTF8=>${charset} conversion table: $fname\n";
	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";

	# Pass the names as printf arguments rather than interpolating them
	# into the format string.
	printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
		   "static const pg_utf_to_local_combined ULmap%s_combined[ %d ] = {",
		   $fname, $charset, scalar(@$table));

	# Several entries can share the same first character, so break ties on
	# the second one; otherwise their order would depend on the input order.
	my @sorted = sort {
		$$a{utf8} <=> $$b{utf8}
		  || $$a{utf8_second} <=> $$b{utf8_second}
	} @$table;

	# Each entry's comment is emitted at the start of the next iteration
	# (or after the loop), so that it lands after the separating comma.
	my $first = 1;
	foreach my $i (@sorted)
	{
		print($out ",") if (!$first);
		$first = 0;
		print($out "\t/* $last_comment */") if ($verbose);

		printf($out "\n {0x%08x, 0x%08x, 0x%04x}", $$i{utf8}, $$i{utf8_second}, $$i{code});
		if ($verbose >= 2)
		{
			$last_comment = "$$i{f}:$$i{l} $$i{comment}";
		}
		else
		{
			$last_comment = "$$i{comment}";
		}
	}
	print($out "\t/* $last_comment */") if ($verbose);

	print $out "\n};\n";

	# Buffered write errors only surface at close, so check it.
	close($out) || die "cannot close output file : $fname: $!\n";
}
sub print_to_utf8_map
{
	# Write <charset>_to_utf8.map (lower-cased) in the current directory:
	# a C array of {code, utf8} pairs sorted by code.
	#
	#   charset - character set name, used in the file and array names
	#   table   - ref to a list of hashes with utf8, code, comment, f, l
	#   verbose - if true, append each entry's comment; if >= 2, prefix the
	#             comment with "file:line"
	#
	# Dies if the file cannot be opened or if writing it fails.
	my ($charset, $table, $verbose) = @_;
	$verbose = 0 unless defined $verbose;

	my $last_comment = "";

	my $fname = lc("${charset}_to_utf8.map");

	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";

	# Pass the names as printf arguments rather than interpolating them
	# into the format string.
	printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
		   "static const pg_local_to_utf LUmap%s[ %d ] = {",
		   $fname, $charset, scalar(@$table));

	# Each entry's comment is emitted at the start of the next iteration
	# (or after the loop), so that it lands after the separating comma.
	my $first = 1;
	foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
	{
		print($out ",") if (!$first);
		$first = 0;
		print($out "\t/* $last_comment */") if ($verbose);

		# The UTF-8 value is printed without zero padding.
		printf($out "\n {0x%04x, 0x%x}", $$i{code}, $$i{utf8});
		if ($verbose >= 2)
		{
			$last_comment = "$$i{f}:$$i{l} $$i{comment}";
		}
		else
		{
			$last_comment = $$i{comment};
		}
	}
	print($out "\t/* $last_comment */") if ($verbose);

	print $out "\n};\n";

	# Buffered write errors only surface at close, so check it.
	close($out) || die "cannot close output file : $fname: $!\n";
}
sub print_to_utf8_combined_map
{
	# Write <charset>_to_utf8_combined.map (lower-cased) in the current
	# directory: a C array of {code, utf8, utf8_second} triples sorted by
	# code.
	#
	#   charset - character set name, used in the file and array names
	#   table   - ref to a list of hashes with utf8, utf8_second, code,
	#             comment, f, l
	#   verbose - if true, append each entry's comment; if >= 2, prefix the
	#             comment with "file:line", as print_to_utf8_map does
	#
	# Dies if the file cannot be opened or if writing it fails.
	my ($charset, $table, $verbose) = @_;
	$verbose = 0 unless defined $verbose;

	my $last_comment = "";

	my $fname = lc("${charset}_to_utf8_combined.map");

	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";

	# Pass the names as printf arguments rather than interpolating them
	# into the format string.
	printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
		   "static const pg_local_to_utf_combined LUmap%s_combined[ %d ] = {",
		   $fname, $charset, scalar(@$table));

	# Each entry's comment is emitted at the start of the next iteration
	# (or after the loop), so that it lands after the separating comma.
	my $first = 1;
	foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
	{
		print($out ",") if (!$first);
		$first = 0;
		print($out "\t/* $last_comment */") if ($verbose);

		printf($out "\n {0x%04x, 0x%08x, 0x%08x}", $$i{code}, $$i{utf8}, $$i{utf8_second});
		if ($verbose >= 2)
		{
			$last_comment = "$$i{f}:$$i{l} $$i{comment}";
		}
		else
		{
			$last_comment = "$$i{comment}";
		}
	}
	print($out "\t/* $last_comment */") if ($verbose);

	print $out "\n};\n";

	# Buffered write errors only surface at close, so check it.
	close($out) || die "cannot close output file : $fname: $!\n";
}
1;

View File

@ -1,7 +1,6 @@
/*
* This file was generated by UCS_to_EUC_JIS_2004.pl
*/
static const pg_local_to_utf LUmapEUC_JIS_2004[] = {
/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map */
static const pg_local_to_utf LUmapEUC_JIS_2004[ 11303 ] = { /* */
{0x0080, 0xc280}, /* U+0080 <control> */
{0x0081, 0xc281}, /* U+0081 <control> */
{0x0082, 0xc282}, /* U+0082 <control> */
@ -205,7 +204,7 @@ static const pg_local_to_utf LUmapEUC_JIS_2004[] = {
{0xa2ac, 0xe28691}, /* U+2191 UPWARDS ARROW */
{0xa2ad, 0xe28693}, /* U+2193 DOWNWARDS ARROW */
{0xa2ae, 0xe38093}, /* U+3013 GETA MARK */
{0xa2af, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
{0xa2af, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0xa2b0, 0xefbc82}, /* U+FF02 FULLWIDTH QUOTATION MARK [2000] */
{0xa2b1, 0xefbc8d}, /* U+FF0D FULLWIDTH HYPHEN-MINUS [2000] */
{0xa2b2, 0xefbd9e}, /* U+FF5E FULLWIDTH TILDE [2000] */

View File

@ -1,7 +1,6 @@
/*
* This file was generated by UCS_to_EUC_JIS_2004.pl
*/
static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {
/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map */
static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[ 25 ] = { /* */
{0xa4f7, 0x00e3818b, 0x00e3829a}, /* U+304B+309A [2000] */
{0xa4f8, 0x00e3818d, 0x00e3829a}, /* U+304D+309A [2000] */
{0xa4f9, 0x00e3818f, 0x00e3829a}, /* U+304F+309A [2000] */

View File

@ -1,6 +1,6 @@
/* src/backend/utils/mb/Unicode/euc_jp_to_utf8.map */
static const pg_local_to_utf LUmapEUC_JP[] = {
static const pg_local_to_utf LUmapEUC_JP[ 13197 ] = {
{0x8ea1, 0xefbda1},
{0x8ea2, 0xefbda2},
{0x8ea3, 0xefbda3},
@ -13197,5 +13197,5 @@ static const pg_local_to_utf LUmapEUC_JP[] = {
{0x8ff4fb, 0xe9ab99},
{0x8ff4fc, 0xe9adb2},
{0x8ff4fd, 0xefa8ad},
{0x8ff4fe, 0xe9bb91},
{0x8ff4fe, 0xe9bb91}
};

View File

@ -1,3 +1,5 @@
/* src/backend/utils/mb/Unicode/euc_kr_to_utf8.map */
static const pg_local_to_utf LUmapEUC_KR[ 8227 ] = {
{0xa1a1, 0xe38080},
{0xa1a2, 0xe38081},

View File

@ -1,3 +1,5 @@
/* src/backend/utils/mb/Unicode/johab_to_utf8.map */
static const pg_local_to_utf LUmapJOHAB[ 17049 ] = {
{0x8444, 0xe384b3},
{0x8446, 0xe384b5},

View File

@ -1,7 +1,6 @@
/*
* This file was generated by UCS_to_SHIFTJIS_2004.pl
*/
static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {
/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map */
static const pg_local_to_utf LUmapSHIFT_JIS_2004[ 11271 ] = { /* */
{0x00a1, 0xefbda1}, /* U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP */
{0x00a2, 0xefbda2}, /* U+FF62 HALFWIDTH LEFT CORNER BRACKET */
{0x00a3, 0xefbda3}, /* U+FF63 HALFWIDTH RIGHT CORNER BRACKET */
@ -173,7 +172,7 @@ static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {
{0x81aa, 0xe28691}, /* U+2191 UPWARDS ARROW */
{0x81ab, 0xe28693}, /* U+2193 DOWNWARDS ARROW */
{0x81ac, 0xe38093}, /* U+3013 GETA MARK */
{0x81ad, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
{0x81ad, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0x81ae, 0xefbc82}, /* U+FF02 FULLWIDTH QUOTATION MARK [2000] */
{0x81af, 0xefbc8d}, /* U+FF0D FULLWIDTH HYPHEN-MINUS [2000] */
{0x81b0, 0x7e}, /* U+007E TILDE [2000] Fullwidth: U+FF5E */

View File

@ -1,7 +1,6 @@
/*
* This file was generated by UCS_to_SHIFT_JIS_2004.pl
*/
static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = {
/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map */
static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[ 25 ] = { /* */
{0x82f5, 0x00e3818b, 0x00e3829a}, /* U+304B+309A [2000] */
{0x82f6, 0x00e3818d, 0x00e3829a}, /* U+304D+309A [2000] */
{0x82f7, 0x00e3818f, 0x00e3829a}, /* U+304F+309A [2000] */

View File

@ -1,35 +0,0 @@
#
# Copyright (c) 2001-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/ucs2utf.pl
# convert UCS-4 to UTF-8
#
# Convert a UCS-4 code point to UTF-8, packed into a single integer.
#
# The bytes of the UTF-8 sequence are packed most-significant-byte first,
# so U+00E9 (UTF-8 bytes C3 A9) comes back as 0xc3a9 and U+3000 as 0xe38080.
# Code points up to 0x7f are returned unchanged.
#
#   $ucs - the code point, as an integer
#
# Returns the packed UTF-8 value as an integer.  No range check is done:
# every value above 0xffff takes the four-byte branch.
sub ucs2utf
{
	# Use lexicals here.  The former "local" declarations temporarily
	# overwrote the package globals $ucs and $utf, and do not compile
	# under "use strict".
	my ($ucs) = @_;
	my $utf;

	if ($ucs <= 0x007f)
	{
		# One byte: plain ASCII, value is unchanged.
		$utf = $ucs;
	}
	elsif ($ucs <= 0x07ff)
	{
		# Two bytes: 110xxxxx 10xxxxxx
		$utf = ((($ucs >> 6) | 0xc0) << 8) | (($ucs & 0x003f) | 0x80);
	}
	elsif ($ucs <= 0xffff)
	{
		# Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
		$utf =
		  ((($ucs >> 12) | 0xe0) << 16) |
		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) |
		  (($ucs & 0x003f) | 0x80);
	}
	else
	{
		# Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		$utf =
		  ((($ucs >> 18) | 0xf0) << 24) |
		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) |
		  (($ucs & 0x003f) | 0x80);
	}
	return $utf;
}
1;

View File

@ -1,3 +1,5 @@
/* src/backend/utils/mb/Unicode/uhc_to_utf8.map */
static const pg_local_to_utf LUmapUHC[ 17237 ] = {
{0x8141, 0xeab082},
{0x8142, 0xeab083},

View File

@ -1,3 +1,5 @@
/* src/backend/utils/mb/Unicode/utf8_to_euc_cn.map */
static const pg_utf_to_local ULmapEUC_CN[ 7445 ] = {
{0xc2a4, 0xa1e8},
{0xc2a7, 0xa1ec},

View File

@ -1,7 +1,6 @@
/*
* This file was generated by UCS_to_EUC_JIS_2004.pl
*/
static const pg_utf_to_local ULmapEUC_JIS_2004[] = {
/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map */
static const pg_utf_to_local ULmapEUC_JIS_2004[ 11303 ] = { /* */
{0xc280, 0x0080}, /* U+0080 <control> */
{0xc281, 0x0081}, /* U+0081 <control> */
{0xc282, 0x0082}, /* U+0082 <control> */
@ -10849,7 +10848,7 @@ static const pg_utf_to_local ULmapEUC_JIS_2004[] = {
{0xefbc84, 0xa1f0}, /* U+FF04 FULLWIDTH DOLLAR SIGN */
{0xefbc85, 0xa1f3}, /* U+FF05 FULLWIDTH PERCENT SIGN */
{0xefbc86, 0xa1f5}, /* U+FF06 FULLWIDTH AMPERSAND */
{0xefbc87, 0xa2af}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
{0xefbc87, 0xa2af}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0xefbc88, 0xa1ca}, /* U+FF08 FULLWIDTH LEFT PARENTHESIS */
{0xefbc89, 0xa1cb}, /* U+FF09 FULLWIDTH RIGHT PARENTHESIS */
{0xefbc8a, 0xa1f6}, /* U+FF0A FULLWIDTH ASTERISK */

View File

@ -1,7 +1,6 @@
/*
* This file was generated by UCS_to_EUC_JIS_2004.pl
*/
static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {
/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map */
static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[ 25 ] = { /* */
{0x0000c3a6, 0x0000cc80, 0xabc4}, /* U+00E6+0300 [2000] */
{0x0000c994, 0x0000cc80, 0xabc8}, /* U+0254+0300 [2000] */
{0x0000c994, 0x0000cc81, 0xabc9}, /* U+0254+0301 [2000] */

View File

@ -1,3 +1,5 @@
/* src/backend/utils/mb/Unicode/utf8_to_euc_jp.map */
static const pg_utf_to_local ULmapEUC_JP[ 13175 ] = {
{0xc2a1, 0x8fa2c2},
{0xc2a4, 0x8fa2f0},

View File

@ -1,3 +1,5 @@
/* src/backend/utils/mb/Unicode/utf8_to_euc_kr.map */
static const pg_utf_to_local ULmapEUC_KR[ 8227 ] = {
{0xc2a1, 0xa2ae},
{0xc2a4, 0xa2b4},

View File

@ -1,3 +1,5 @@
/* src/backend/utils/mb/Unicode/utf8_to_johab.map */
static const pg_utf_to_local ULmapJOHAB[ 17049 ] = {
{0xc2a1, 0xd9ae},
{0xc2a4, 0xd9b4},

View File

@ -1,7 +1,6 @@
/*
* This file was generated by UCS_to_SHIFT_JIS_2004.pl
*/
static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {
/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map */
static const pg_utf_to_local ULmapSHIFT_JIS_2004[ 11271 ] = { /* */
{0xc2a0, 0x8541}, /* U+00A0 NO-BREAK SPACE [2000] */
{0xc2a1, 0x8542}, /* U+00A1 INVERTED EXCLAMATION MARK [2000] */
{0xc2a2, 0x8191}, /* U+00A2 CENT SIGN Windows: U+FFE0 */
@ -10817,7 +10816,7 @@ static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {
{0xefbc84, 0x8190}, /* U+FF04 FULLWIDTH DOLLAR SIGN */
{0xefbc85, 0x8193}, /* U+FF05 FULLWIDTH PERCENT SIGN */
{0xefbc86, 0x8195}, /* U+FF06 FULLWIDTH AMPERSAND */
{0xefbc87, 0x81ad}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
{0xefbc87, 0x81ad}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0xefbc88, 0x8169}, /* U+FF08 FULLWIDTH LEFT PARENTHESIS */
{0xefbc89, 0x816a}, /* U+FF09 FULLWIDTH RIGHT PARENTHESIS */
{0xefbc8a, 0x8196}, /* U+FF0A FULLWIDTH ASTERISK */

View File

@ -1,7 +1,6 @@
/*
* This file was generated by UCS_to_SHIFT_JIS_2004.pl
*/
static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = {
/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map */
static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[ 25 ] = { /* */
{0x0000c3a6, 0x0000cc80, 0x8663}, /* U+00E6+0300 [2000] */
{0x0000c994, 0x0000cc80, 0x8667}, /* U+0254+0300 [2000] */
{0x0000c994, 0x0000cc81, 0x8668}, /* U+0254+0301 [2000] */

View File

@ -3,7 +3,7 @@
static const pg_utf_to_local ULmapSJIS[ 7397 ] = {
{0xc2a2, 0x8191},
{0xc2a3, 0x8192},
{0xc2a5, 0x5c},
{0xc2a5, 0x005c},
{0xc2a7, 0x8198},
{0xc2a8, 0x814e},
{0xc2ac, 0x81ca},
@ -142,7 +142,7 @@ static const pg_utf_to_local ULmapSJIS[ 7397 ] = {
{0xe280b2, 0x818c},
{0xe280b3, 0x818d},
{0xe280bb, 0x81a6},
{0xe280be, 0x7e},
{0xe280be, 0x007e},
{0xe28483, 0x818e},
{0xe28496, 0xfa59},
{0xe284a1, 0xfa5a},

View File

@ -1,3 +1,5 @@
/* src/backend/utils/mb/Unicode/utf8_to_uhc.map */
static const pg_utf_to_local ULmapUHC[ 17237 ] = {
{0xc2a1, 0xa2ae},
{0xc2a4, 0xa2b4},