From 1de9cc0dcca649d1900720924f4ea5c430d1a51e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 30 Nov 2016 14:54:02 +0200 Subject: [PATCH] Rewrite the perl scripts to produce our Unicode conversion tables. Generate EUC_CN mappings from gb-18030-2000.xml, because GB2312.TXT is no longer available. Get UHC from windows-949-2000.xml, it's more up-to-date. Plus tons more small changes. With these changes, the perl scripts faithfully produce the *.map files we have in the repository, from the external source files. In the passing, fix the Makefile to also download CP932.TXT and CP950.TXT. Based on patches by Kyotaro Horiguchi, reviewed by Daniel Gustafsson. Discussion: https://postgr.es/m/08e7892a-d55c-eefe-76e6-7910bc8dd1f3@iki.fi --- src/backend/utils/mb/Unicode/Makefile | 22 +- src/backend/utils/mb/Unicode/UCS_to_BIG5.pl | 184 +------- src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl | 162 +++---- .../utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl | 274 +----------- src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl | 414 ++++++++---------- src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl | 115 +---- src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl | 152 ++----- .../utils/mb/Unicode/UCS_to_GB18030.pl | 80 +--- src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl | 31 ++ .../utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl | 205 ++------- src/backend/utils/mb/Unicode/UCS_to_SJIS.pl | 157 ++----- src/backend/utils/mb/Unicode/UCS_to_UHC.pl | 51 +++ src/backend/utils/mb/Unicode/UCS_to_most.pl | 116 +---- src/backend/utils/mb/Unicode/convutils.pm | 282 ++++++++++++ .../utils/mb/Unicode/euc_jis_2004_to_utf8.map | 9 +- .../Unicode/euc_jis_2004_to_utf8_combined.map | 7 +- .../utils/mb/Unicode/euc_jp_to_utf8.map | 4 +- .../utils/mb/Unicode/euc_kr_to_utf8.map | 2 + .../utils/mb/Unicode/johab_to_utf8.map | 2 + .../mb/Unicode/shift_jis_2004_to_utf8.map | 9 +- .../shift_jis_2004_to_utf8_combined.map | 7 +- src/backend/utils/mb/Unicode/ucs2utf.pl | 35 -- src/backend/utils/mb/Unicode/uhc_to_utf8.map | 2 + 
.../utils/mb/Unicode/utf8_to_euc_cn.map | 2 + .../utils/mb/Unicode/utf8_to_euc_jis_2004.map | 9 +- .../Unicode/utf8_to_euc_jis_2004_combined.map | 7 +- .../utils/mb/Unicode/utf8_to_euc_jp.map | 2 + .../utils/mb/Unicode/utf8_to_euc_kr.map | 2 + .../utils/mb/Unicode/utf8_to_johab.map | 2 + .../mb/Unicode/utf8_to_shift_jis_2004.map | 9 +- .../utf8_to_shift_jis_2004_combined.map | 7 +- src/backend/utils/mb/Unicode/utf8_to_sjis.map | 4 +- src/backend/utils/mb/Unicode/utf8_to_uhc.map | 2 + 33 files changed, 809 insertions(+), 1559 deletions(-) create mode 100755 src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl create mode 100755 src/backend/utils/mb/Unicode/UCS_to_UHC.pl create mode 100644 src/backend/utils/mb/Unicode/convutils.pm delete mode 100644 src/backend/utils/mb/Unicode/ucs2utf.pl diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile index 9d2ef5e3d2..ea21f4a852 100644 --- a/src/backend/utils/mb/Unicode/Makefile +++ b/src/backend/utils/mb/Unicode/Makefile @@ -39,8 +39,6 @@ WINMAPS = win866_to_utf8.map utf8_to_win866.map \ win1258_to_utf8.map utf8_to_win1258.map GENERICMAPS = $(ISO8859MAPS) $(WINMAPS) \ - johab_to_utf8.map utf8_to_johab.map \ - uhc_to_utf8.map utf8_to_uhc.map \ gbk_to_utf8.map utf8_to_gbk.map \ koi8r_to_utf8.map utf8_to_koi8r.map @@ -51,6 +49,8 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \ sjis_to_utf8.map utf8_to_sjis.map \ gb18030_to_utf8.map utf8_to_gb18030.map \ big5_to_utf8.map utf8_to_big5.map \ + johab_to_utf8.map utf8_to_johab.map \ + uhc_to_utf8.map utf8_to_uhc.map \ euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map \ utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map \ shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map \ @@ -63,23 +63,29 @@ ISO8859TEXTS = 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT \ 8859-10.TXT 8859-13.TXT 8859-14.TXT 8859-15.TXT \ 8859-16.TXT -WINTEXTS = CP866.TXT CP874.TXT CP936.TXT CP949.TXT \ +WINTEXTS = CP866.TXT CP874.TXT CP936.TXT \ 
CP1250.TXT CP1251.TXT \ CP1252.TXT CP1253.TXT CP1254.TXT CP1255.TXT \ CP1256.TXT CP1257.TXT CP1258.TXT GENERICTEXTS = $(ISO8859TEXTS) $(WINTEXTS) \ - KOI8-R.TXT KOI8-U.TXT JOHAB.TXT + KOI8-R.TXT KOI8-U.TXT all: $(MAPS) $(GENERICMAPS): UCS_to_most.pl $(GENERICTEXTS) $(PERL) $< -euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl JIS0201.TXT JIS0208.TXT JIS0212.TXT +johab_to_utf8.map utf8_to_johab.map: UCS_to_JOHAB.pl JOHAB.TXT $(PERL) $< -euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl GB2312.TXT +uhc_to_utf8.map utf8_to_uhc.map: UCS_to_UHC.pl windows-949-2000.xml + $(PERL) $< + +euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl CP932.TXT JIS0212.TXT + $(PERL) $< + +euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl gb-18030-2000.xml $(PERL) $< euc_kr_to_utf8.map utf8_to_euc_kr.map: UCS_to_EUC_KR.pl KSX1001.TXT @@ -119,7 +125,7 @@ BIG5.TXT CNS11643.TXT: euc-jis-2004-std.txt sjis-0213-2004-std.txt: $(DOWNLOAD) http://x0213.org/codetable/$(@F) -gb-18030-2000.xml: +gb-18030-2000.xml windows-949-2000.xml: $(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F) GB2312.TXT: @@ -137,7 +143,7 @@ KOI8-R.TXT KOI8-U.TXT: $(ISO8859TEXTS): $(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F) -$(filter-out CP8%,$(WINTEXTS)): +$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT: $(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F) $(filter CP8%,$(WINTEXTS)): diff --git a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl index 127fd157b0..6a1321bab8 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl @@ -25,56 +25,17 @@ # # and Unicode name (not used in this script) -require "ucs2utf.pl"; +require "convutils.pm"; +# Load BIG5.TXT +my $all = &read_source("BIG5.TXT"); -# -# first, generate UTF8 --> BIG5 table -# -$in_file = "BIG5.TXT"; +# Load CP950.TXT +my $cp950txt = &read_source("CP950.TXT"); 
-open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$utf} = $code; - } -} -close(FILE); - -$in_file = "CP950.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); +foreach my $i (@$cp950txt) { + my $code = $i->{code}; + my $ucs = $i->{ucs}; # Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc # from CP950.TXT @@ -83,126 +44,25 @@ while () && $code >= 0xf9d6 && $code <= 0xf9dc) { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$utf} = $code; - } -} -close(FILE); - -$file = lc("utf8_to_big5.map"); -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapBIG5[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; + push @$all, {code => $code, + ucs => $ucs, + comment => $i->{comment}, + direction => "both"}; } } -print FILE "};\n"; -close(FILE); +foreach my $i (@$all) { + my $code = $i->{code}; + my $ucs = $i->{ucs}; -# -# then generate BIG5 --> UTF8 table -# -$in_file = "BIG5.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; - -while () -{ - chop; - if (/^#/) + # BIG5.TXT maps several BIG5 characters to U+FFFD. The UTF-8 to BIG5 mapping can + # contain only one of them. 
XXX: Doesn't really make sense to include any of them, + # but for historical reasons, we map the first one of them. + if ($i->{ucs} == 0xFFFD && $i->{code} != 0xA15A) { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$code} = $utf; - } -} -close(FILE); - -$in_file = "CP950.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - - # Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc - # from CP950.TXT - if ( $code >= 0x80 - && $ucs >= 0x0080 - && $code >= 0xf9d6 - && $code <= 0xf9dc) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$code} = $utf; - } -} -close(FILE); - -$file = lc("big5_to_utf8.map"); -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf LUmapBIG5[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) -{ - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; + $i->{direction} = "to_unicode"; } } -print FILE "};\n"; -close(FILE); +# Output +print_tables("BIG5", $all); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl index 53f44773c9..8df23f8be6 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl @@ -1,128 +1,76 @@ #! 
/usr/bin/perl # -# Copyright (c) 2001-2016, PostgreSQL Global Development Group +# Copyright (c) 2007-2016, PostgreSQL Global Development Group # -# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl +# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl # -# Generate UTF-8 <--> EUC_CN code conversion tables from -# map files provided by Unicode organization. -# Unfortunately it is prohibited by the organization -# to distribute the map files. So if you try to use this script, -# you have to obtain GB2312.TXT from -# the organization's ftp site. +# Generate UTF-8 <--> GB18030 code conversion tables from +# "gb-18030-2000.xml", obtained from +# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ # -# GB2312.TXT format: -# GB2312 code in hex -# UCS-2 code in hex -# # and Unicode name (not used in this script) +# The lines we care about in the source file look like +# +# where the "u" field is the Unicode code point in hex, +# and the "b" field is the hex byte sequence for GB18030 -require "ucs2utf.pl"; +require "convutils.pm"; -# first generate UTF-8 --> EUC_CN table +# Read the input -$in_file = "GB2312.TXT"; +$in_file = "gb-18030-2000.xml"; open(FILE, $in_file) || die("cannot open $in_file"); -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $array{$utf} = ($code | 0x8080); - } -} -close(FILE); - -$file = "utf8_to_euc_cn.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapEUC_CN[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 
0x%04x},\n", $index, $code; - } -} - -print FILE "};\n"; -close(FILE); - -# -# then generate EUC_CN --> UTF8 table -# -reset 'array'; - -open(FILE, $in_file) || die("cannot open $in_file"); +my @mapping; while () { - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; + next if (!m/= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - $code |= 0x8080; - $array{$code} = $utf; + # The GB-18030 character set, which we use as the source, contains + # a lot of extra characters on top of the GB2312 character set that + # EUC_CN encodes. Filter out those extra characters. + next if (($code & 0xFF) < 0xA1); + next if (!($code >= 0xA100 && $code <= 0xA9FF || + $code >= 0xB000 && $code <= 0xF7FF)); + + next if ($code >= 0xA2A1 && $code <= 0xA2B0); + next if ($code >= 0xA2E3 && $code <= 0xA2E4); + next if ($code >= 0xA2EF && $code <= 0xA2F0); + next if ($code >= 0xA2FD && $code <= 0xA2FE); + next if ($code >= 0xA4F4 && $code <= 0xA4FE); + next if ($code >= 0xA5F7 && $code <= 0xA5FE); + next if ($code >= 0xA6B9 && $code <= 0xA6C0); + next if ($code >= 0xA6D9 && $code <= 0xA6FE); + next if ($code >= 0xA7C2 && $code <= 0xA7D0); + next if ($code >= 0xA7F2 && $code <= 0xA7FE); + next if ($code >= 0xA8BB && $code <= 0xA8C4); + next if ($code >= 0xA8EA && $code <= 0xA8FE); + next if ($code >= 0xA9A1 && $code <= 0xA9A3); + next if ($code >= 0xA9F0 && $code <= 0xA9FE); + next if ($code >= 0xD7FA && $code <= 0xD7FE); + + # A couple of characters are mapped differently from GB-2312 or GB-18030 + if ($code == 0xA1A4) + { + $ucs = 0x30FB; + } + if ($code == 0xA1AA) + { + $ucs = 0x2015; + } + + push @mapping, { + ucs => $ucs, + code => $code, + direction => 'both' } } close(FILE); -$file = "euc_cn_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf 
LUmapEUC_CN[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) -{ - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("EUC_CN", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl index d2f1b757cb..b4e140b657 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl @@ -7,9 +7,7 @@ # Generate UTF-8 <--> EUC_JIS_2004 code conversion tables from # "euc-jis-2004-std.txt" (http://x0213.org) -require "ucs2utf.pl"; - -$TEST = 0; +require "convutils.pm"; # first generate UTF-8 --> EUC_JIS_2004 table @@ -17,10 +15,7 @@ $in_file = "euc-jis-2004-std.txt"; open(FILE, $in_file) || die("cannot open $in_file"); -reset 'array'; -reset 'array1'; -reset 'comment'; -reset 'comment1'; +my @all; while ($line = ) { @@ -31,14 +26,14 @@ while ($line = ) $u2 = $3; $rest = "U+" . $u1 . "+" . $u2 . 
$4; $code = hex($c); - $ucs = hex($u1); - $utf1 = &ucs2utf($ucs); - $ucs = hex($u2); - $utf2 = &ucs2utf($ucs); - $str = sprintf "%08x%08x", $utf1, $utf2; - $array1{$str} = $code; - $comment1{$str} = $rest; - $count1++; + $ucs1 = hex($u1); + $ucs2 = hex($u2); + + push @all, { direction => 'both', + ucs => $ucs1, + ucs_second => $ucs2, + code => $code, + comment => $rest }; next; } elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) @@ -54,252 +49,11 @@ while ($line = ) $ucs = hex($u); $code = hex($c); - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$utf} = $code; - $comment{$code} = $rest; + next if ($code < 0x80 && $ucs < 0x80); + + push @all, { direction => 'both', ucs => $ucs, code => $code, comment => $rest }; } close(FILE); -$file = "utf8_to_euc_jis_2004.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE "static const pg_utf_to_local ULmapEUC_JIS_2004[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%08x, 0x%06x} /* %s */\n", $index, $code, - $comment{$code}; - } - else - { - printf FILE " {0x%08x, 0x%06x}, /* %s */\n", $index, $code, - $comment{$code}; - } -} - -print FILE "};\n"; -close(FILE); - -if ($TEST == 1) -{ - $file1 = "utf8.data"; - $file2 = "euc_jis_2004.data"; - open(FILE1, "> $file1") || die("cannot open $file1"); - open(FILE2, "> $file2") || die("cannot open $file2"); - - for $index (sort { $a <=> $b } keys(%array)) - { - $code = $array{$index}; - if ( $code > 0x00 - && $code != 0x09 - && $code != 0x0a - && $code != 0x0d - && $code != 0x5c - && ( $code < 0x80 - || ($code >= 0x8ea1 && $code <= 0x8efe) - || ($code >= 0x8fa1a1 && $code <= 0x8ffefe) - || ($code >= 0xa1a1 && $code <= 0x8fefe))) - { - for ($i = 3; $i >= 0; $i--) - { - $s = 
$i * 8; - $mask = 0xff << $s; - print FILE1 pack("C", ($index & $mask) >> $s) - if $index & $mask; - print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask; - } - print FILE1 "\n"; - print FILE2 "\n"; - } - } -} - -$file = "utf8_to_euc_jis_2004_combined.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE - "static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {\n"; - -for $index (sort { $a cmp $b } keys(%array1)) -{ - $code = $array1{$index}; - $count1--; - if ($count1 == 0) - { - printf FILE " {0x%s, 0x%s, 0x%06x} /* %s */\n", substr($index, 0, 8), - substr($index, 8, 8), $code, $comment1{$index}; - } - else - { - printf FILE " {0x%s, 0x%s, 0x%06x}, /* %s */\n", - substr($index, 0, 8), substr($index, 8, 8), $code, - $comment1{$index}; - } -} - -print FILE "};\n"; -close(FILE); - -if ($TEST == 1) -{ - for $index (sort { $a cmp $b } keys(%array1)) - { - $code = $array1{$index}; - if ( $code > 0x00 - && $code != 0x09 - && $code != 0x0a - && $code != 0x0d - && $code != 0x5c - && ( $code < 0x80 - || ($code >= 0x8ea1 && $code <= 0x8efe) - || ($code >= 0x8fa1a1 && $code <= 0x8ffefe) - || ($code >= 0xa1a1 && $code <= 0x8fefe))) - { - - $v1 = hex(substr($index, 0, 8)); - $v2 = hex(substr($index, 8, 8)); - - for ($i = 3; $i >= 0; $i--) - { - $s = $i * 8; - $mask = 0xff << $s; - print FILE1 pack("C", ($v1 & $mask) >> $s) if $v1 & $mask; - print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask; - } - for ($i = 3; $i >= 0; $i--) - { - $s = $i * 8; - $mask = 0xff << $s; - print FILE1 pack("C", ($v2 & $mask) >> $s) if $v2 & $mask; - } - print FILE1 "\n"; - print FILE2 "\n"; - } - } - close(FILE1); - close(FILE2); -} - -# then generate EUC_JIS_2004 --> UTF-8 table - -$in_file = "euc-jis-2004-std.txt"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; -reset 'array1'; -reset 'comment'; -reset 
'comment1'; - -while ($line = ) -{ - if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/) - { - $c = $1; - $u1 = $2; - $u2 = $3; - $rest = "U+" . $u1 . "+" . $u2 . $4; - $code = hex($c); - $ucs = hex($u1); - $utf1 = &ucs2utf($ucs); - $ucs = hex($u2); - $utf2 = &ucs2utf($ucs); - $str = sprintf "%08x%08x", $utf1, $utf2; - $array1{$code} = $str; - $comment1{$code} = $rest; - $count1++; - next; - } - elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) - { - $c = $1; - $u = $2; - $rest = "U+" . $u . $3; - } - else - { - next; - } - - $ucs = hex($u); - $code = hex($c); - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $array{$code} = $utf; - $comment{$utf} = $rest; -} -close(FILE); - -$file = "euc_jis_2004_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE "static const pg_local_to_utf LUmapEUC_JIS_2004[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%06x, 0x%08x} /* %s */\n", $index, $code, - $comment{$code}; - } - else - { - printf FILE " {0x%06x, 0x%08x}, /* %s */\n", $index, $code, - $comment{$code}; - } -} - -print FILE "};\n"; -close(FILE); - -$file = "euc_jis_2004_to_utf8_combined.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE - "static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array1)) -{ - $code = $array1{$index}; - $count1--; - if ($count1 == 0) - { - printf FILE " {0x%06x, 0x%s, 0x%s} /* %s */\n", $index, - substr($code, 0, 8), substr($code, 8, 8), $comment1{$index}; - } - else - { - printf FILE " {0x%06x, 0x%s, 0x%s}, /* %s */\n", $index, - 
substr($code, 0, 8), substr($code, 8, 8), $comment1{$index}; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("EUC_JIS_2004", \@all, 1); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl index 055fc849ba..0e9dd292bf 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl @@ -8,275 +8,223 @@ # map files provided by Unicode organization. # Unfortunately it is prohibited by the organization # to distribute the map files. So if you try to use this script, -# you have to obtain JIS0201.TXT, JIS0208.TXT, JIS0212.TXT from -# the organization's ftp site. -# -# JIS0201.TXT format: -# JIS0201 code in hex -# UCS-2 code in hex -# # and Unicode name (not used in this script) -# -# JIS0208.TXT format: -# JIS0208 shift-JIS code in hex -# JIS0208 code in hex -# UCS-2 code in hex -# # and Unicode name (not used in this script) -# -# JIS0212.TXT format: -# JIS0212 code in hex -# UCS-2 code in hex -# # and Unicode name (not used in this script) +# you have to obtain CP932.TXT and JIS0212.TXT from the +# organization's ftp site. -require "ucs2utf.pl"; +use strict; +require "convutils.pm"; -# first generate UTF-8 --> EUC_JP table +# Load JIS0212.TXT +my $jis0212 = &read_source("JIS0212.TXT"); -# -# JIS0201 -# -$in_file = "JIS0201.TXT"; +my @mapping; -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; - -while () -{ - chop; - if (/^#/) +foreach my $i (@$jis0212) { + # We have a different mapping for this in the EUC_JP to UTF-8 direction. 
+ if ($i->{code} == 0x2243) { - next; + $i->{direction} = "from_unicode"; } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - # add single shift 2 - $array{$utf} = ($code | 0x8e00); + if ($i->{code} == 0x2271) + { + $i->{direction} = "to_unicode"; } -} -close(FILE); -# -# JIS0208 -# -$in_file = "JIS0208.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) + if ($i->{ucs} >= 0x080) { - next; - } - ($s, $c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $array{$utf} = ($code | 0x8080); - } -} -close(FILE); - -# -# JIS0212 -# -$in_file = "JIS0212.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $array{$utf} = ($code | 0x8f8080); - } -} -close(FILE); - -$file = "utf8_to_euc_jp.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapEUC_JP[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; + $i->{code} = $i->{code} | 0x8f8080; } else { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; + next; } + + push @mapping, $i; } -print FILE "};\n"; -close(FILE); +# Load CP932.TXT. 
+my $ct932 = &read_source("CP932.TXT"); -# -# then generate EUC_JP --> UTF8 table -# +foreach my $i (@$ct932) { + my $sjis = $i->{code}; -# -# JIS0201 -# -$in_file = "JIS0201.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; - -while () -{ - chop; - if (/^#/) + # We have a different mapping for this in the EUC_JP to UTF-8 direction. + if ($sjis == 0xeefa || + $sjis == 0xeefb || + $sjis == 0xeefc) { next; } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - # add single shift 2 - $code |= 0x8e00; - $array{$code} = $utf; + if ($sjis >= 0xa1) + { + my $jis = &sjis2jis($sjis); + + $i->{code} = $jis | ($jis < 0x100 ? 0x8e00 : + ($sjis >= 0xeffd ? 0x8f8080 : 0x8080)); + + # Remember the SJIS code for later. + $i->{sjis} = $sjis; + + push @mapping, $i; } } -close(FILE); -# -# JIS0208 -# -$in_file = "JIS0208.TXT"; +foreach my $i (@mapping) { + my $sjis = $i->{sjis}; -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) + # These SJIS characters are excluded completely. 
+ if ($sjis >= 0xed00 && $sjis <= 0xeef9 || + $sjis >= 0xfa54 && $sjis <= 0xfa56 || + $sjis >= 0xfa58 && $sjis <= 0xfc4b) { + $i->{direction} = "none"; next; } - ($s, $c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - - $code |= 0x8080; - $array{$code} = $utf; - } -} -close(FILE); - -# -# JIS0212 -# -$in_file = "JIS0212.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) + + # These SJIS characters are only in the UTF-8 to EUC_JP table + if ($sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc) { + $i->{direction} = "from_unicode"; next; } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - $code |= 0x8f8080; - $array{$code} = $utf; + if ($sjis == 0x8790 || $sjis == 0x8791 || $sjis == 0x8792 || + $sjis == 0x8795 || $sjis == 0x8796 || $sjis == 0x8797 || + $sjis == 0x879a || $sjis == 0x879b || $sjis == 0x879c || + ($sjis >= 0xfa4a && $sjis <= 0xfa53)) + { + $i->{direction} = "to_unicode"; + next; } } -close(FILE); -$file = "euc_jp_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); +push @mapping, ( + {direction => 'both', ucs => 0x4efc, code => 0x8ff4af, comment => '# CJK(4EFC)'}, + {direction => 'both', ucs => 0x50f4, code => 0x8ff4b0, comment => '# CJK(50F4)'}, + {direction => 'both', ucs => 0x51EC, code => 0x8ff4b1, comment => '# CJK(51EC)'}, + {direction => 'both', ucs => 0x5307, code => 0x8ff4b2, comment => '# CJK(5307)'}, + {direction => 'both', ucs => 0x5324, code => 0x8ff4b3, comment => '# CJK(5324)'}, + {direction => 'both', ucs => 0x548A, code => 0x8ff4b5, comment => '# CJK(548A)'}, + {direction => 'both', ucs => 
0x5759, code => 0x8ff4b6, comment => '# CJK(5759)'}, + {direction => 'both', ucs => 0x589E, code => 0x8ff4b9, comment => '# CJK(589E)'}, + {direction => 'both', ucs => 0x5BEC, code => 0x8ff4ba, comment => '# CJK(5BEC)'}, + {direction => 'both', ucs => 0x5CF5, code => 0x8ff4bb, comment => '# CJK(5CF5)'}, + {direction => 'both', ucs => 0x5D53, code => 0x8ff4bc, comment => '# CJK(5D53)'}, + {direction => 'both', ucs => 0x5FB7, code => 0x8ff4be, comment => '# CJK(5FB7)'}, + {direction => 'both', ucs => 0x6085, code => 0x8ff4bf, comment => '# CJK(6085)'}, + {direction => 'both', ucs => 0x6120, code => 0x8ff4c0, comment => '# CJK(6120)'}, + {direction => 'both', ucs => 0x654E, code => 0x8ff4c1, comment => '# CJK(654E)'}, + {direction => 'both', ucs => 0x663B, code => 0x8ff4c2, comment => '# CJK(663B)'}, + {direction => 'both', ucs => 0x6665, code => 0x8ff4c3, comment => '# CJK(6665)'}, + {direction => 'both', ucs => 0x6801, code => 0x8ff4c6, comment => '# CJK(6801)'}, + {direction => 'both', ucs => 0x6A6B, code => 0x8ff4c9, comment => '# CJK(6A6B)'}, + {direction => 'both', ucs => 0x6AE2, code => 0x8ff4ca, comment => '# CJK(6AE2)'}, + {direction => 'both', ucs => 0x6DF2, code => 0x8ff4cc, comment => '# CJK(6DF2)'}, + {direction => 'both', ucs => 0x6DF8, code => 0x8ff4cb, comment => '# CJK(6DF8)'}, + {direction => 'both', ucs => 0x7028, code => 0x8ff4cd, comment => '# CJK(7028)'}, + {direction => 'both', ucs => 0x70BB, code => 0x8ff4ae, comment => '# CJK(70BB)'}, + {direction => 'both', ucs => 0x7501, code => 0x8ff4d0, comment => '# CJK(7501)'}, + {direction => 'both', ucs => 0x7682, code => 0x8ff4d1, comment => '# CJK(7682)'}, + {direction => 'both', ucs => 0x769E, code => 0x8ff4d2, comment => '# CJK(769E)'}, + {direction => 'both', ucs => 0x7930, code => 0x8ff4d4, comment => '# CJK(7930)'}, + {direction => 'both', ucs => 0x7AE7, code => 0x8ff4d9, comment => '# CJK(7AE7)'}, + {direction => 'both', ucs => 0x7DA0, code => 0x8ff4dc, comment => '# CJK(7DA0)'}, + {direction 
=> 'both', ucs => 0x7DD6, code => 0x8ff4dd, comment => '# CJK(7DD6)'}, + {direction => 'both', ucs => 0x8362, code => 0x8ff4df, comment => '# CJK(8362)'}, + {direction => 'both', ucs => 0x85B0, code => 0x8ff4e1, comment => '# CJK(85B0)'}, + {direction => 'both', ucs => 0x8807, code => 0x8ff4e4, comment => '# CJK(8807)'}, + {direction => 'both', ucs => 0x8B7F, code => 0x8ff4e6, comment => '# CJK(8B7F)'}, + {direction => 'both', ucs => 0x8CF4, code => 0x8ff4e7, comment => '# CJK(8CF4)'}, + {direction => 'both', ucs => 0x8D76, code => 0x8ff4e8, comment => '# CJK(8D76)'}, + {direction => 'both', ucs => 0x90DE, code => 0x8ff4ec, comment => '# CJK(90DE)'}, + {direction => 'both', ucs => 0x9115, code => 0x8ff4ee, comment => '# CJK(9115)'}, + {direction => 'both', ucs => 0x9592, code => 0x8ff4f1, comment => '# CJK(9592)'}, + {direction => 'both', ucs => 0x973B, code => 0x8ff4f4, comment => '# CJK(973B)'}, + {direction => 'both', ucs => 0x974D, code => 0x8ff4f5, comment => '# CJK(974D)'}, + {direction => 'both', ucs => 0x9751, code => 0x8ff4f6, comment => '# CJK(9751)'}, + {direction => 'both', ucs => 0x999E, code => 0x8ff4fa, comment => '# CJK(999E)'}, + {direction => 'both', ucs => 0x9AD9, code => 0x8ff4fb, comment => '# CJK(9AD9)'}, + {direction => 'both', ucs => 0x9B72, code => 0x8ff4fc, comment => '# CJK(9B72)'}, + {direction => 'both', ucs => 0x9ED1, code => 0x8ff4fe, comment => '# CJK(9ED1)'}, + {direction => 'both', ucs => 0xF929, code => 0x8ff4c5, comment => '# CJK COMPATIBILITY IDEOGRAPH-F929'}, + {direction => 'both', ucs => 0xF9DC, code => 0x8ff4f2, comment => '# CJK COMPATIBILITY IDEOGRAPH-F9DC'}, + {direction => 'both', ucs => 0xFA0E, code => 0x8ff4b4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0E'}, + {direction => 'both', ucs => 0xFA0F, code => 0x8ff4b7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0F'}, + {direction => 'both', ucs => 0xFA10, code => 0x8ff4b8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA10'}, + {direction => 'both', ucs => 0xFA11, code => 
0x8ff4bd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA11'}, + {direction => 'both', ucs => 0xFA12, code => 0x8ff4c4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA12'}, + {direction => 'both', ucs => 0xFA13, code => 0x8ff4c7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA13'}, + {direction => 'both', ucs => 0xFA14, code => 0x8ff4c8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA14'}, + {direction => 'both', ucs => 0xFA15, code => 0x8ff4ce, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA15'}, + {direction => 'both', ucs => 0xFA16, code => 0x8ff4cf, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA16'}, + {direction => 'both', ucs => 0xFA17, code => 0x8ff4d3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA17'}, + {direction => 'both', ucs => 0xFA18, code => 0x8ff4d5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA18'}, + {direction => 'both', ucs => 0xFA19, code => 0x8ff4d6, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA19'}, + {direction => 'both', ucs => 0xFA1A, code => 0x8ff4d7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1A'}, + {direction => 'both', ucs => 0xFA1B, code => 0x8ff4d8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1B'}, + {direction => 'both', ucs => 0xFA1C, code => 0x8ff4da, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1C'}, + {direction => 'both', ucs => 0xFA1D, code => 0x8ff4db, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1D'}, + {direction => 'both', ucs => 0xFA1E, code => 0x8ff4de, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1E'}, + {direction => 'both', ucs => 0xFA1F, code => 0x8ff4e0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1F'}, + {direction => 'both', ucs => 0xFA20, code => 0x8ff4e2, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA20'}, + {direction => 'both', ucs => 0xFA21, code => 0x8ff4e3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA21'}, + {direction => 'both', ucs => 0xFA22, code => 0x8ff4e5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA22'}, + {direction => 'both', ucs => 0xFA23, code => 0x8ff4e9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA23'}, + {direction 
=> 'both', ucs => 0xFA24, code => 0x8ff4ea, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA24'}, + {direction => 'both', ucs => 0xFA25, code => 0x8ff4eb, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA25'}, + {direction => 'both', ucs => 0xFA26, code => 0x8ff4ed, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA26'}, + {direction => 'both', ucs => 0xFA27, code => 0x8ff4ef, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA27'}, + {direction => 'both', ucs => 0xFA28, code => 0x8ff4f0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA28'}, + {direction => 'both', ucs => 0xFA29, code => 0x8ff4f3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA29'}, + {direction => 'both', ucs => 0xFA2A, code => 0x8ff4f7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2A'}, + {direction => 'both', ucs => 0xFA2B, code => 0x8ff4f8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2B'}, + {direction => 'both', ucs => 0xFA2C, code => 0x8ff4f9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2C'}, + {direction => 'both', ucs => 0xFA2D, code => 0x8ff4fd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2D'}, + {direction => 'both', ucs => 0xFF07, code => 0x8ff4a9, comment => '# FULLWIDTH APOSTROPHE'}, + {direction => 'both', ucs => 0xFFE4, code => 0x8fa2c3, comment => '# FULLWIDTH BROKEN BAR'}, -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf LUmapEUC_JP[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) + # additional conversions for EUC_JP -> UTF-8 conversion + {direction => 'to_unicode', ucs => 0x2116, code => 0x8ff4ac, comment => '# NUMERO SIGN'}, + {direction => 'to_unicode', ucs => 0x2121, code => 0x8ff4ad, comment => '# TELEPHONE SIGN'}, + {direction => 'to_unicode', ucs => 0x3231, code => 0x8ff4ab, comment => '# PARENTHESIZED IDEOGRAPH STOCK'} + ); + +print_tables("EUC_JP", \@mapping); + +####################################################################### +# sjis2jis ; SJIS => JIS conversion +sub sjis2jis { - $utf = $array{$index}; - $count--; - if ($count 
== 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} + my ($sjis) = @_; -print FILE "};\n"; -close(FILE); + return $sjis if ($sjis <= 0x100); + + my $hi = $sjis >> 8; + my $lo = $sjis & 0xff; + + if ($lo >= 0x80) { $lo--; } + $lo -= 0x40; + if ($hi >= 0xe0) { $hi -= 0x40; } + $hi -= 0x81; + my $pos = $lo + $hi * 0xbc; + + if ($pos >= 114 * 0x5e && $pos <= 115 * 0x5e + 0x1b) + { + # This region (115-ku) is outside the range of JIS codes, but to + # make it convenient to generate codes in EUC CODESET 3, move it + # to a seemingly duplicate region (83-84-ku). + $pos = $pos - ((31 * 0x5e) + 12); + + # after 85-ku 82-ten needs to be moved 2 codepoints + $pos = $pos - 2 if ($pos >= 84 * 0x5c + 82) + } + + my $hi2 = $pos / 0x5e; + my $lo2 = ($pos % 0x5e); + + my $ret = $lo2 + 0x21 + (($hi2 + 0x21) << 8); + + return $ret; +} diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl index a7c94bca91..a917d06717 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl @@ -16,113 +16,22 @@ # UCS-2 code in hex # # and Unicode name (not used in this script) -require "ucs2utf.pl"; +require "convutils.pm"; -# first generate UTF-8 --> EUC_KR table +# Load the source file.
-$in_file = "KSX1001.TXT"; +my $mapping = &read_source("KSX1001.TXT"); -open(FILE, $in_file) || die("cannot open $in_file"); - -while () +foreach my $i (@$mapping) { - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $array{$utf} = ($code | 0x8080); - } -} -close(FILE); - -$file = "utf8_to_euc_kr.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapEUC_KR[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; - } + $i->{code} = $i->{code} | 0x8080; } -print FILE "};\n"; -close(FILE); +# Some extra characters that are not in KSX1001.TXT +push @$mapping, ( + {direction => 'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN'}, + {direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN'}, + {direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U'} + ); -# -# then generate EUC_KR --> UTF8 table -# -reset 'array'; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - - $code |= 0x8080; - $array{$code} = $utf; - } -} -close(FILE); - -$file = "euc_kr_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; 
-print FILE "static const pg_local_to_utf LUmapEUC_KR[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) -{ - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("EUC_KR", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl index e4fc535b18..aceef5433c 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl @@ -17,141 +17,47 @@ # UCS-2 code in hex # # and Unicode name (not used in this script) -require "ucs2utf.pl"; +require "convutils.pm"; -# first generate UTF-8 --> EUC_TW table +my $mapping = &read_source("CNS11643.TXT"); -$in_file = "CNS11643.TXT"; +my @extras; -open(FILE, $in_file) || die("cannot open $in_file"); - -while () +foreach my $i (@$mapping) { - chop; - if (/^#/) + my $ucs = $i->{ucs}; + my $code = $i->{code}; + my $origcode = $i->{code}; + + my $plane = ($code & 0x1f0000) >> 16; + if ($plane > 16) { + printf STDERR "Warning: invalid plane No.$plane. ignored\n"; next; } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) + + if ($plane == 1) { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $plane = ($code & 0x1f0000) >> 16; - if ($plane > 16) - { - printf STDERR "Warning: invalid plane No.$plane. 
ignored\n"; - next; - } - - if ($plane == 1) - { - $array{$utf} = (($code & 0xffff) | 0x8080); - } - else - { - $array{$utf} = - (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080); - } - } -} -close(FILE); - -$file = "utf8_to_euc_tw.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapEUC_TW[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; + $code = ($code & 0xffff) | 0x8080; } else { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; + $code = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080); + } + $i->{code} = $code; + + # Some codes are mapped twice in the EUC_TW to UTF-8 table. + if ($origcode >= 0x12121 && $origcode <= 0x20000) + { + push @extras, { + ucs => $i->{ucs}, + code => ($i->{code} + 0x8ea10000), + rest => $i->{rest}, + direction => 'to_unicode' + } } } -print FILE "};\n"; -close(FILE); +push @$mapping, @extras; -# -# then generate EUC_TW --> UTF8 table -# -reset 'array'; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - - $plane = ($code & 0x1f0000) >> 16; - if ($plane > 16) - { - printf STDERR "Warning: invalid plane No.$plane. 
ignored\n"; - next; - } - - if ($plane == 1) - { - $c = (($code & 0xffff) | 0x8080); - $array{$c} = $utf; - $count++; - } - $c = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080); - $array{$c} = $utf; - } -} -close(FILE); - -$file = "euc_tw_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf LUmapEUC_TW[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) -{ - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("EUC_TW", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl index 043c1c27ec..f58361024e 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl @@ -13,8 +13,7 @@ # where the "u" field is the Unicode code point in hex, # and the "b" field is the hex byte sequence for GB18030 -require "ucs2utf.pl"; - +require "convutils.pm"; # Read the input @@ -22,6 +21,8 @@ $in_file = "gb-18030-2000.xml"; open(FILE, $in_file) || die("cannot open $in_file"); +my @mapping; + while () { next if (!m/) $code = hex($c); if ($code >= 0x80 && $ucs >= 0x0080) { - $utf = &ucs2utf($ucs); - if ($arrayu{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; + push @mapping, { + ucs => $ucs, + code => $code, + direction => 'both' } - if ($arrayc{$code} ne "") - { - printf STDERR "Warning: duplicate GB18030: %08x\n", $code; - next; - } - $arrayu{$utf} = $code; - $arrayc{$code} = $utf; - $count++; } } close(FILE); - -# -# first, generate UTF8 --> GB18030 table -# - -$file = "utf8_to_gb18030.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE 
"static const pg_utf_to_local ULmapGB18030[ $count ] = {\n"; - -$cc = $count; -for $index (sort { $a <=> $b } keys(%arrayu)) -{ - $code = $arrayu{$index}; - $cc--; - if ($cc == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; - } -} - -print FILE "};\n"; -close(FILE); - - -# -# then generate GB18030 --> UTF8 table -# - -$file = "gb18030_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n"; - -$cc = $count; -for $index (sort { $a <=> $b } keys(%arrayc)) -{ - $utf = $arrayc{$index}; - $cc--; - if ($cc == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("GB18030", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl new file mode 100755 index 0000000000..b98f9a7bf5 --- /dev/null +++ b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl @@ -0,0 +1,31 @@ +#! /usr/bin/perl +# +# Copyright (c) 2001-2016, PostgreSQL Global Development Group +# +# src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl +# +# Generate UTF-8 <--> JOHAB conversion tables from +# map files provided by Unicode organization. +# Unfortunately it is prohibited by the organization +# to distribute the map files. So if you try to use this script, +# you have to obtain the map files from the organization's ftp site. +# ftp://www.unicode.org/Public/MAPPINGS/ +# We assume the file include three tab-separated columns: +# JOHAB code in hex +# UCS-2 code in hex +# # and Unicode name (not used in this script) + +require "convutils.pm"; + +# Load the source file. 
+ +my $mapping = &read_source("JOHAB.TXT"); + +# Some extra characters that are not in JOHAB.TXT +push @$mapping, ( + {direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN'}, + {direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN'}, + {direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U'} + ); + +print_tables("JOHAB", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl index 51ffd86b2c..16a53ad1d9 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl @@ -7,7 +7,7 @@ # Generate UTF-8 <--> SHIFT_JIS_2004 code conversion tables from # "sjis-0213-2004-std.txt" (http://x0213.org) -require "ucs2utf.pl"; +require "convutils.pm"; # first generate UTF-8 --> SHIFT_JIS_2004 table @@ -15,10 +15,7 @@ $in_file = "sjis-0213-2004-std.txt"; open(FILE, $in_file) || die("cannot open $in_file"); -reset 'array'; -reset 'array1'; -reset 'comment'; -reset 'comment1'; +my @mapping; while ($line = ) { @@ -29,14 +26,16 @@ while ($line = ) $u2 = $3; $rest = "U+" . $u1 . "+" . $u2 . 
$4; $code = hex($c); - $ucs = hex($u1); - $utf1 = &ucs2utf($ucs); - $ucs = hex($u2); - $utf2 = &ucs2utf($ucs); - $str = sprintf "%08x%08x", $utf1, $utf2; - $array1{$str} = $code; - $comment1{$str} = $rest; - $count1++; + $ucs1 = hex($u1); + $ucs2 = hex($u2); + + push @mapping, { + code => $code, + ucs => $ucs1, + ucs_second => $ucs2, + comment => $rest, + direction => 'both' + }; next; } elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) @@ -52,183 +51,31 @@ while ($line = ) $ucs = hex($u); $code = hex($c); - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR - "Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf, - $ucs, $code; - next; - } - $count++; - $array{$utf} = $code; - $comment{$code} = $rest; -} -close(FILE); - -$file = "utf8_to_shift_jis_2004.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE "static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%08x, 0x%06x} /* %s */\n", $index, $code, - $comment{$code}; - } - else - { - printf FILE " {0x%08x, 0x%06x}, /* %s */\n", $index, $code, - $comment{$code}; - } -} - -print FILE "};\n"; -close(FILE); - -$file = "utf8_to_shift_jis_2004_combined.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE -"static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = {\n"; - -for $index (sort { $a cmp $b } keys(%array1)) -{ - $code = $array1{$index}; - $count1--; - if ($count1 == 0) - { - printf FILE " {0x%s, 0x%s, 0x%04x} /* %s */\n", substr($index, 0, 8), - substr($index, 8, 8), $code, $comment1{$index}; - } - else - { - printf FILE " {0x%s, 0x%s, 0x%04x}, /* %s */\n", - substr($index, 0, 
8), substr($index, 8, 8), $code, - $comment1{$index}; - } -} - -print FILE "};\n"; -close(FILE); - -# then generate SHIFT_JIS_2004 --> UTF-8 table - -$in_file = "sjis-0213-2004-std.txt"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; -reset 'array1'; -reset 'comment'; -reset 'comment1'; - -while ($line = ) -{ - if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/) - { - $c = $1; - $u1 = $2; - $u2 = $3; - $rest = "U+" . $u1 . "+" . $u2 . $4; - $code = hex($c); - $ucs = hex($u1); - $utf1 = &ucs2utf($ucs); - $ucs = hex($u2); - $utf2 = &ucs2utf($ucs); - $str = sprintf "%08x%08x", $utf1, $utf2; - $array1{$code} = $str; - $comment1{$code} = $rest; - $count1++; - next; - } - elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) - { - $c = $1; - $u = $2; - $rest = "U+" . $u . $3; - } - else + if ($code < 0x80 && $ucs < 0x80) { next; } - - $ucs = hex($u); - $code = hex($c); - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") + elsif ($code < 0x80) { - printf STDERR - "Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf, - $ucs, $code; - printf STDERR "Previous value: UTF8: %08x\n", $array{$utf}; - next; + $direction = 'from_unicode'; } - $count++; - - $array{$code} = $utf; - $comment{$utf} = $rest; -} -close(FILE); - -$file = "shift_jis_2004_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_SHIFTJIS_2004.pl\n"; -print FILE " */\n"; -print FILE "static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) + elsif ($ucs < 0x80) { - printf FILE " {0x%04x, 0x%08x} /* %s */\n", $index, $code, - $comment{$code}; + $direction = 'to_unicode'; } else { - printf FILE " {0x%04x, 0x%08x}, /* %s */\n", $index, $code, - $comment{$code}; + $direction = 'both'; } -} -print FILE "};\n"; + push @mapping, { + code => $code, + ucs => $ucs, + comment => $rest, + 
direction => $direction + }; +} close(FILE); -$file = "shift_jis_2004_to_utf8_combined.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE -"static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array1)) -{ - $code = $array1{$index}; - $count1--; - if ($count1 == 0) - { - printf FILE " {0x%04x, 0x%s, 0x%s} /* %s */\n", $index, - substr($code, 0, 8), substr($code, 8, 8), $comment1{$index}; - } - else - { - printf FILE " {0x%04x, 0x%s, 0x%s}, /* %s */\n", $index, - substr($code, 0, 8), substr($code, 8, 8), $comment1{$index}; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("SHIFT_JIS_2004", \@mapping, 1); diff --git a/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl b/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl index 10e54b157d..c8ff712af8 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl @@ -4,138 +4,45 @@ # # src/backend/utils/mb/Unicode/UCS_to_SJIS.pl # -# Generate UTF-8 <--> SJIS code conversion tables from -# map files provided by Unicode organization. -# Unfortunately it is prohibited by the organization -# to distribute the map files. So if you try to use this script, -# you have to obtain SHIFTJIS.TXT from -# the organization's ftp site. -# -# SHIFTJIS.TXT format: -# SHIFTJIS code in hex -# UCS-2 code in hex -# # and Unicode name (not used in this script) -# Warning: SHIFTJIS.TXT contains only JIS0201 and JIS0208. no JIS0212. +# Generate UTF-8 <=> SJIS code conversion tables from the map file +# provided by the Unicode organization. Unfortunately it is prohibited +# by the organization to distribute the map files. So if you try to +# use this script, you have to obtain CP932.TXT from the organization's +# ftp site.
-require "ucs2utf.pl"; +use strict; +require "convutils.pm"; -# first generate UTF-8 --> SJIS table +my $charset = read_source("CP932.TXT"); -$in_file = "CP932.TXT"; -$count = 0; +# Drop these SJIS codes from the source for UTF8=>SJIS conversion +my @reject_sjis =( + 0xed40..0xeefc, 0x8754..0x875d, 0x878a, 0x8782, + 0x8784, 0xfa5b, 0xfa54, 0x8790..0x8792, 0x8795..0x8797, + 0x879a..0x879c +); -open(FILE, $in_file) || die("cannot open $in_file"); - -while () +foreach my $i (@$charset) { - chop; - if (/^#/) + my $code = $i->{code}; + my $ucs = $i->{ucs}; + + if (grep {$code == $_} @reject_sjis) { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ((($code >= 0xed40) && ($code <= 0xeefc)) - || ( ($code >= 0x8754) - && ($code <= 0x875d)) - || ($code == 0x878a) - || ($code == 0x8782) - || ($code == 0x8784) - || ($code == 0xfa5b) - || ($code == 0xfa54) - || ( ($code >= 0x8790) - && ($code <= 0x8792)) - || ( ($code >= 0x8795) - && ($code <= 0x8797)) - || ( ($code >= 0x879a) - && ($code <= 0x879c))) - { - printf STDERR "Warning: duplicate UTF8: UCS=0x%04x SJIS=0x%04x\n", - $ucs, - $code; - next; - } - $count++; - $array{$utf} = $code; + $i->{direction} = "to_unicode"; } } -close(FILE); +# Add these UTF8->SJIS pairs to the table. 
+push @$charset, ( + {direction => "from_unicode", ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN'}, + {direction => "from_unicode", ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN'}, + {direction => "from_unicode", ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN'}, + {direction => "from_unicode", ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN'}, + {direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE'}, + {direction => "from_unicode", ucs => 0x203e, code => 0x7e, comment => '# OVERLINE'}, + {direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN'}, + {direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH'} +); -$file = "utf8_to_sjis.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapSJIS[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; - } -} - -print FILE "};\n"; -close(FILE); - -# -# then generate SJIS --> UTF8 table -# - -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; -$count = 0; - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - $count++; - - $array{$code} = $utf; - } -} -close(FILE); - -$file = "sjis_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf LUmapSJIS[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) -{ - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { 
- printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("SJIS", $charset); diff --git a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl new file mode 100755 index 0000000000..b6bf3bd8f2 --- /dev/null +++ b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl @@ -0,0 +1,51 @@ +#! /usr/bin/perl +# +# Copyright (c) 2007-2016, PostgreSQL Global Development Group +# +# src/backend/utils/mb/Unicode/UCS_to_UHC.pl +# +# Generate UTF-8 <--> UHC code conversion tables from +# "windows-949-2000.xml", obtained from +# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ +# +# The lines we care about in the source file look like +# +# where the "u" field is the Unicode code point in hex, +# and the "b" field is the hex byte sequence for UHC + +require "convutils.pm"; + +# Read the input + +$in_file = "windows-949-2000.xml"; + +open(FILE, $in_file) || die("cannot open $in_file"); + +my @mapping; + +while () +{ + next if (!m/= 0x80 && $ucs >= 0x0080) + { + push @mapping, { + ucs => $ucs, + code => $code, + direction => 'both' + } + } +} +close(FILE); + +# One extra character that's not in the source file.
+push @mapping, { direction => 'both', code => 0xa2e8, ucs => 0x327e, comment => 'CIRCLED HANGUL IEUNG U' }; + +print_tables("UHC", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_most.pl b/src/backend/utils/mb/Unicode/UCS_to_most.pl index 125378f149..a3cf436eef 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_most.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_most.pl @@ -15,7 +15,7 @@ # UCS-2 code in hex # # and Unicode name (not used in this script) -require "ucs2utf.pl"; +require "convutils.pm"; %filename = ( 'WIN866' => 'CP866.TXT', @@ -44,121 +44,13 @@ require "ucs2utf.pl"; 'ISO8859_16' => '8859-16.TXT', 'KOI8R' => 'KOI8-R.TXT', 'KOI8U' => 'KOI8-U.TXT', - 'GBK' => 'CP936.TXT', - 'UHC' => 'CP949.TXT', - 'JOHAB' => 'JOHAB.TXT',); + 'GBK' => 'CP936.TXT'); @charsets = keys(%filename); @charsets = @ARGV if scalar(@ARGV); foreach $charset (@charsets) { + my $mapping = &read_source($filename{$charset}); - # - # first, generate UTF8-> charset table - # - $in_file = $filename{$charset}; - - open(FILE, $in_file) || die("cannot open $in_file"); - - reset 'array'; - - while () - { - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$utf} = $code; - } - } - close(FILE); - - $file = lc("utf8_to_${charset}.map"); - open(FILE, "> $file") || die("cannot open $file"); - - print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; - print FILE "static const pg_utf_to_local ULmap${charset}[ $count ] = {\n"; - - for $index (sort { $a <=> $b } keys(%array)) - { - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; - } - } - - print FILE "};\n"; - close(FILE); - - # - # then generate character set code ->UTF8 table 
- # - open(FILE, $in_file) || die("cannot open $in_file"); - - reset 'array'; - - while () - { - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$code} = $utf; - } - } - close(FILE); - - $file = lc("${charset}_to_utf8.map"); - open(FILE, "> $file") || die("cannot open $file"); - - print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; - print FILE "static const pg_local_to_utf LUmap${charset}[ $count ] = {\n"; - for $index (sort { $a <=> $b } keys(%array)) - { - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } - } - - print FILE "};\n"; - close(FILE); + print_tables($charset, $mapping); } diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm new file mode 100644 index 0000000000..d6a13e8c02 --- /dev/null +++ b/src/backend/utils/mb/Unicode/convutils.pm @@ -0,0 +1,282 @@ +# +# Copyright (c) 2001-2016, PostgreSQL Global Development Group +# +# src/backend/utils/mb/Unicode/convutils.pm + +use strict; + +####################################################################### +# convert UCS-4 to UTF-8 +# +sub ucs2utf +{ + my ($ucs) = @_; + my $utf; + + if ($ucs <= 0x007f) + { + $utf = $ucs; + } + elsif ($ucs > 0x007f && $ucs <= 0x07ff) + { + $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); + } + elsif ($ucs > 0x07ff && $ucs <= 0xffff) + { + $utf = + ((($ucs >> 12) | 0xe0) << 16) | + (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); + } + else + { + $utf = + ((($ucs >> 18) | 0xf0) << 24) | + (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | + (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); + } + return ($utf); +} + 
+####################################################################### +# read_source - common routine to read source file +# +# fname ; input file name +sub read_source +{ + my ($fname) = @_; + my @r; + + open(my $in, '<', $fname) || die("cannot open $fname"); + + while (<$in>) + { + next if (/^#/); + chop; + + next if (/^$/); # Ignore empty lines + + next if (/^0x([0-9A-F]+)\s+(#.*)$/); + + # Skip the first column for JIS0208.TXT + if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/) + { + print STDERR "READ ERROR at line $. in $fname: $_\n"; + exit; + } + my $out = {f => $fname, l => $., + code => hex($1), + ucs => hex($2), + comment => $4, + direction => "both" + }; + + # Ignore pure ASCII mappings. PostgreSQL character conversion code + # never even passes these to the conversion code. + next if ($out->{code} < 0x80 || $out->{ucs} < 0x80); + + push(@r, $out); + } + close($in); + + return \@r; +} + +################################################################## +# print_tables : output mapping tables +# +# Arguments: +# charset - string name of the character set. +# table - mapping table (see format below) +# verbose - if 1, output comment on each line, +# if 2, also output source file name and number +# +# +# +# Mapping table format: +# +# Mapping table is a list of hashes. Each hash has the following fields: +# direction - Direction: 'both', 'from_unicode' or 'to_unicode' +# ucs - Unicode code point +# ucs_second - Second Unicode code point, if this is a "combined" character. 
+# code - Byte sequence in the "other" character set, as an integer
+# comment - Text representation of the character
+# f - Source filename
+# l - Line number in source file
+#
+# Entries that define 'ucs_second' are written to the *_combined maps, all
+# other entries to the plain maps. An entry's 'direction' ("both",
+# "to_unicode" or "from_unicode") selects the direction(s) it is emitted
+# for. A combined map is only written when it would be non-empty.
+#
+sub print_tables
+{
+	my ($charset, $table, $verbose) = @_;
+
+	# Split the input into four lists: plain and combined entries, each for
+	# the to-UTF8 and the from-UTF8 direction.
+	my @to_unicode;
+	my @to_unicode_combined;
+	my @from_unicode;
+	my @from_unicode_combined;
+
+	foreach my $i (@$table)
+	{
+		if (defined $i->{ucs_second})
+		{
+			# Two UCS values map to one code: goes to the combined maps.
+			my $entry = {utf8 => ucs2utf($i->{ucs}),
+						 utf8_second => ucs2utf($i->{ucs_second}),
+						 code => $i->{code},
+						 comment => $i->{comment},
+						 f => $i->{f}, l => $i->{l}};
+			if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode")
+			{
+				push @to_unicode_combined, $entry;
+			}
+			if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode")
+			{
+				push @from_unicode_combined, $entry;
+			}
+		}
+		else
+		{
+			my $entry = {utf8 => ucs2utf($i->{ucs}),
+						 code => $i->{code},
+						 comment => $i->{comment},
+						 f => $i->{f}, l => $i->{l}};
+			if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode")
+			{
+				push @to_unicode, $entry;
+			}
+			if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode")
+			{
+				push @from_unicode, $entry;
+			}
+		}
+	}
+
+	print_to_utf8_map($charset, \@to_unicode, $verbose);
+	print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0);
+	print_from_utf8_map($charset, \@from_unicode, $verbose);
+	print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0);
+}
+
+#
+# Write utf8_to_<charset>.map (name lower-cased, relative to the current
+# directory): a C array of pg_utf_to_local entries {utf8, code}, sorted
+# numerically by 'utf8'.
+#
+# If $verbose is true, every entry is followed by a C comment with its
+# 'comment' text (and an empty comment precedes the first entry); with
+# $verbose >= 2 the text is prefixed with the entry's "f:l" source location.
+# Dies if the file cannot be opened or completely written.
+#
+sub print_from_utf8_map
+{
+	my ($charset, $table, $verbose) = @_;
+
+	my $last_comment = "";
+
+	my $fname = lc("utf8_to_${charset}.map");
+	print "- Writing UTF8=>${charset} conversion table: $fname\n";
+	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+	printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
+		   "static const pg_utf_to_local ULmap${charset}[ %d ] = {",
+		   scalar(@$table));
+	my $first = 1;
+	foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table)
+	{
+		print($out ",") if (!$first);
+		$first = 0;
+		# An entry's comment has to follow its separating comma, so it is
+		# emitted one iteration late.
+		print($out "\t/* $last_comment */") if ($verbose);
+
+		printf($out "\n  {0x%04x, 0x%04x}", $$i{utf8}, $$i{code});
+		if ($verbose >= 2)
+		{
+			$last_comment = "$$i{f}:$$i{l} $$i{comment}";
+		}
+		else
+		{
+			$last_comment = $$i{comment};
+		}
+	}
+	print($out "\t/* $last_comment */") if ($verbose);
+	print $out "\n};\n";
+	# Buffered write errors only surface at close time.
+	close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+#
+# Write utf8_to_<charset>_combined.map (name lower-cased): a C array of
+# pg_utf_to_local_combined entries {utf8, utf8_second, code}.
+#
+# If $verbose is true, every entry is followed by a C comment with its
+# 'comment' text; unlike the plain map there is no "f:l" variant.
+# Dies if the file cannot be opened or completely written.
+#
+sub print_from_utf8_combined_map
+{
+	my ($charset, $table, $verbose) = @_;
+
+	my $last_comment = "";
+
+	my $fname = lc("utf8_to_${charset}_combined.map");
+	print "- Writing UTF8=>${charset} conversion table: $fname\n";
+	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+	printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
+		   "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
+		   scalar(@$table));
+	my $first = 1;
+	# Order on the (utf8, utf8_second) pair, so that entries sharing the
+	# first value come out in a defined order instead of depending on the
+	# input order.
+	# NOTE(review): presumably the consumer of this table searches it by
+	# that pair - confirm.
+	foreach my $i (sort {$$a{utf8} <=> $$b{utf8} ||
+						 $$a{utf8_second} <=> $$b{utf8_second}} @$table)
+	{
+		print($out ",") if (!$first);
+		$first = 0;
+		# Comment of the previous entry, emitted after its comma.
+		print($out "\t/* $last_comment */") if ($verbose);
+
+		printf($out "\n  {0x%08x, 0x%08x, 0x%04x}", $$i{utf8}, $$i{utf8_second}, $$i{code});
+		$last_comment = "$$i{comment}";
+	}
+	print($out "\t/* $last_comment */") if ($verbose);
+	print $out "\n};\n";
+	# Buffered write errors only surface at close time.
+	close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+#
+# Write <charset>_to_utf8.map (name lower-cased): a C array of
+# pg_local_to_utf entries {code, utf8}, sorted numerically by 'code'.
+#
+# $verbose works as in print_from_utf8_map. Dies if the file cannot be
+# opened or completely written.
+#
+sub print_to_utf8_map
+{
+	my ($charset, $table, $verbose) = @_;
+
+	my $last_comment = "";
+
+	my $fname = lc("${charset}_to_utf8.map");
+
+	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+	printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
+		   "static const pg_local_to_utf LUmap${charset}[ %d ] = {",
+		   scalar(@$table));
+	my $first = 1;
+	foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
+	{
+		print($out ",") if (!$first);
+		$first = 0;
+		# Comment of the previous entry, emitted after its comma.
+		print($out "\t/* $last_comment */") if ($verbose);
+
+		# Note: the UTF-8 value is printed without zero padding here.
+		printf($out "\n  {0x%04x, 0x%x}", $$i{code}, $$i{utf8});
+		if ($verbose >= 2)
+		{
+			$last_comment = "$$i{f}:$$i{l} $$i{comment}";
+		}
+		else
+		{
+			$last_comment = $$i{comment};
+		}
+	}
+	print($out "\t/* $last_comment */") if ($verbose);
+	print $out "\n};\n";
+	# Buffered write errors only surface at close time.
+	close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+#
+# Write <charset>_to_utf8_combined.map (name lower-cased): a C array of
+# pg_local_to_utf_combined entries {code, utf8, utf8_second}, sorted
+# numerically by 'code'.
+#
+# If $verbose is true, every entry is followed by a C comment with its
+# 'comment' text; there is no "f:l" variant.
+# Dies if the file cannot be opened or completely written.
+#
+sub print_to_utf8_combined_map
+{
+	my ($charset, $table, $verbose) = @_;
+
+	my $last_comment = "";
+
+	my $fname = lc("${charset}_to_utf8_combined.map");
+
+	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+	printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
+		   "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
+		   scalar(@$table));
+	my $first = 1;
+	foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
+	{
+		print($out ",") if (!$first);
+		$first = 0;
+		# Comment of the previous entry, emitted after its comma.
+		print($out "\t/* $last_comment */") if ($verbose);
+
+		printf($out "\n  {0x%04x, 0x%08x, 0x%08x}", $$i{code}, $$i{utf8}, $$i{utf8_second});
+		$last_comment = "$$i{comment}";
+	}
+	print($out "\t/* $last_comment */") if ($verbose);
+	print $out "\n};\n";
+	# Buffered write errors only surface at close time.
+	close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+1;
diff --git a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map index 2c3a607bf8..33fd42ac46 100644 --- a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map +++ b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_EUC_JIS_2004.pl - */ -static const pg_local_to_utf LUmapEUC_JIS_2004[] = { +/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map */ + +static const pg_local_to_utf LUmapEUC_JIS_2004[ 11303 ] = { /* */ {0x0080, 0xc280}, /* U+0080 */ {0x0081, 0xc281}, /* 
U+0081 */ {0x0082, 0xc282}, /* U+0082 */ @@ -205,7 +204,7 @@ static const pg_local_to_utf LUmapEUC_JIS_2004[] = { {0xa2ac, 0xe28691}, /* U+2191 UPWARDS ARROW */ {0xa2ad, 0xe28693}, /* U+2193 DOWNWARDS ARROW */ {0xa2ae, 0xe38093}, /* U+3013 GETA MARK */ - {0xa2af, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */ + {0xa2af, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE */ {0xa2b0, 0xefbc82}, /* U+FF02 FULLWIDTH QUOTATION MARK [2000] */ {0xa2b1, 0xefbc8d}, /* U+FF0D FULLWIDTH HYPHEN-MINUS [2000] */ {0xa2b2, 0xefbd9e}, /* U+FF5E FULLWIDTH TILDE [2000] */ diff --git a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map index 7a7f85b105..2d8987b990 100644 --- a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map +++ b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_EUC_JIS_2004.pl - */ -static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = { +/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map */ + +static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[ 25 ] = { /* */ {0xa4f7, 0x00e3818b, 0x00e3829a}, /* U+304B+309A [2000] */ {0xa4f8, 0x00e3818d, 0x00e3829a}, /* U+304D+309A [2000] */ {0xa4f9, 0x00e3818f, 0x00e3829a}, /* U+304F+309A [2000] */ diff --git a/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map b/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map index db427cbb24..eb17f9829c 100644 --- a/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map +++ b/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map @@ -1,6 +1,6 @@ /* src/backend/utils/mb/Unicode/euc_jp_to_utf8.map */ -static const pg_local_to_utf LUmapEUC_JP[] = { +static const pg_local_to_utf LUmapEUC_JP[ 13197 ] = { {0x8ea1, 0xefbda1}, {0x8ea2, 0xefbda2}, {0x8ea3, 0xefbda3}, @@ -13197,5 +13197,5 @@ static const pg_local_to_utf LUmapEUC_JP[] = { {0x8ff4fb, 0xe9ab99}, {0x8ff4fc, 0xe9adb2}, {0x8ff4fd, 0xefa8ad}, - {0x8ff4fe, 
0xe9bb91}, + {0x8ff4fe, 0xe9bb91} }; diff --git a/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map b/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map index e37152137d..701a7a476f 100644 --- a/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map +++ b/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/euc_kr_to_utf8.map */ + static const pg_local_to_utf LUmapEUC_KR[ 8227 ] = { {0xa1a1, 0xe38080}, {0xa1a2, 0xe38081}, diff --git a/src/backend/utils/mb/Unicode/johab_to_utf8.map b/src/backend/utils/mb/Unicode/johab_to_utf8.map index 8110f6e853..e31d24184c 100644 --- a/src/backend/utils/mb/Unicode/johab_to_utf8.map +++ b/src/backend/utils/mb/Unicode/johab_to_utf8.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/johab_to_utf8.map */ + static const pg_local_to_utf LUmapJOHAB[ 17049 ] = { {0x8444, 0xe384b3}, {0x8446, 0xe384b5}, diff --git a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map index 81c898c6be..958dde7b83 100644 --- a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map +++ b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_SHIFTJIS_2004.pl - */ -static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = { +/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map */ + +static const pg_local_to_utf LUmapSHIFT_JIS_2004[ 11271 ] = { /* */ {0x00a1, 0xefbda1}, /* U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP */ {0x00a2, 0xefbda2}, /* U+FF62 HALFWIDTH LEFT CORNER BRACKET */ {0x00a3, 0xefbda3}, /* U+FF63 HALFWIDTH RIGHT CORNER BRACKET */ @@ -173,7 +172,7 @@ static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = { {0x81aa, 0xe28691}, /* U+2191 UPWARDS ARROW */ {0x81ab, 0xe28693}, /* U+2193 DOWNWARDS ARROW */ {0x81ac, 0xe38093}, /* U+3013 GETA MARK */ - {0x81ad, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */ + {0x81ad, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE */ {0x81ae, 0xefbc82}, /* U+FF02 FULLWIDTH 
QUOTATION MARK [2000] */ {0x81af, 0xefbc8d}, /* U+FF0D FULLWIDTH HYPHEN-MINUS [2000] */ {0x81b0, 0x7e}, /* U+007E TILDE [2000] Fullwidth: U+FF5E */ diff --git a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map index b1c7bced5f..414e59dc40 100644 --- a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map +++ b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_SHIFT_JIS_2004.pl - */ -static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = { +/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map */ + +static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[ 25 ] = { /* */ {0x82f5, 0x00e3818b, 0x00e3829a}, /* U+304B+309A [2000] */ {0x82f6, 0x00e3818d, 0x00e3829a}, /* U+304D+309A [2000] */ {0x82f7, 0x00e3818f, 0x00e3829a}, /* U+304F+309A [2000] */ diff --git a/src/backend/utils/mb/Unicode/ucs2utf.pl b/src/backend/utils/mb/Unicode/ucs2utf.pl deleted file mode 100644 index e0f1fb226f..0000000000 --- a/src/backend/utils/mb/Unicode/ucs2utf.pl +++ /dev/null @@ -1,35 +0,0 @@ -# -# Copyright (c) 2001-2016, PostgreSQL Global Development Group -# -# src/backend/utils/mb/Unicode/ucs2utf.pl -# convert UCS-4 to UTF-8 -# -sub ucs2utf -{ - local ($ucs) = @_; - local $utf; - - if ($ucs <= 0x007f) - { - $utf = $ucs; - } - elsif ($ucs > 0x007f && $ucs <= 0x07ff) - { - $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); - } - elsif ($ucs > 0x07ff && $ucs <= 0xffff) - { - $utf = - ((($ucs >> 12) | 0xe0) << 16) | - (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); - } - else - { - $utf = - ((($ucs >> 18) | 0xf0) << 24) | - (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | - (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); - } - return ($utf); -} -1; diff --git a/src/backend/utils/mb/Unicode/uhc_to_utf8.map 
b/src/backend/utils/mb/Unicode/uhc_to_utf8.map index 26a7b18f65..65c7e114a3 100644 --- a/src/backend/utils/mb/Unicode/uhc_to_utf8.map +++ b/src/backend/utils/mb/Unicode/uhc_to_utf8.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/uhc_to_utf8.map */ + static const pg_local_to_utf LUmapUHC[ 17237 ] = { {0x8141, 0xeab082}, {0x8142, 0xeab083}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map b/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map index b28eb9cc0c..3d64cd1a60 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map +++ b/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/utf8_to_euc_cn.map */ + static const pg_utf_to_local ULmapEUC_CN[ 7445 ] = { {0xc2a4, 0xa1e8}, {0xc2a7, 0xa1ec}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map index 5137201217..b50e232b6c 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map +++ b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_EUC_JIS_2004.pl - */ -static const pg_utf_to_local ULmapEUC_JIS_2004[] = { +/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map */ + +static const pg_utf_to_local ULmapEUC_JIS_2004[ 11303 ] = { /* */ {0xc280, 0x0080}, /* U+0080 */ {0xc281, 0x0081}, /* U+0081 */ {0xc282, 0x0082}, /* U+0082 */ @@ -10849,7 +10848,7 @@ static const pg_utf_to_local ULmapEUC_JIS_2004[] = { {0xefbc84, 0xa1f0}, /* U+FF04 FULLWIDTH DOLLAR SIGN */ {0xefbc85, 0xa1f3}, /* U+FF05 FULLWIDTH PERCENT SIGN */ {0xefbc86, 0xa1f5}, /* U+FF06 FULLWIDTH AMPERSAND */ - {0xefbc87, 0xa2af}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */ + {0xefbc87, 0xa2af}, /* U+FF07 FULLWIDTH APOSTROPHE */ {0xefbc88, 0xa1ca}, /* U+FF08 FULLWIDTH LEFT PARENTHESIS */ {0xefbc89, 0xa1cb}, /* U+FF09 FULLWIDTH RIGHT PARENTHESIS */ {0xefbc8a, 0xa1f6}, /* U+FF0A FULLWIDTH ASTERISK */ diff --git 
a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map index d8ff5c0586..0d57667a55 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map +++ b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_EUC_JIS_2004.pl - */ -static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = { +/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map */ + +static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[ 25 ] = { /* */ {0x0000c3a6, 0x0000cc80, 0xabc4}, /* U+00E6+0300 [2000] */ {0x0000c994, 0x0000cc80, 0xabc8}, /* U+0254+0300 [2000] */ {0x0000c994, 0x0000cc81, 0xabc9}, /* U+0254+0301 [2000] */ diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map b/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map index 137d4fdef6..eef6db65b3 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map +++ b/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/utf8_to_euc_jp.map */ + static const pg_utf_to_local ULmapEUC_JP[ 13175 ] = { {0xc2a1, 0x8fa2c2}, {0xc2a4, 0x8fa2f0}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map b/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map index 4a78b260ea..a642b2154f 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map +++ b/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/utf8_to_euc_kr.map */ + static const pg_utf_to_local ULmapEUC_KR[ 8227 ] = { {0xc2a1, 0xa2ae}, {0xc2a4, 0xa2b4}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_johab.map b/src/backend/utils/mb/Unicode/utf8_to_johab.map index 869f8213d2..78997d82d0 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_johab.map +++ b/src/backend/utils/mb/Unicode/utf8_to_johab.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/utf8_to_johab.map */ + static const pg_utf_to_local ULmapJOHAB[ 17049 ] = 
{ {0xc2a1, 0xd9ae}, {0xc2a4, 0xd9b4}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map index 4fab64fc95..e9f9e638c6 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map +++ b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_SHIFT_JIS_2004.pl - */ -static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = { +/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map */ + +static const pg_utf_to_local ULmapSHIFT_JIS_2004[ 11271 ] = { /* */ {0xc2a0, 0x8541}, /* U+00A0 NO-BREAK SPACE [2000] */ {0xc2a1, 0x8542}, /* U+00A1 INVERTED EXCLAMATION MARK [2000] */ {0xc2a2, 0x8191}, /* U+00A2 CENT SIGN Windows: U+FFE0 */ @@ -10817,7 +10816,7 @@ static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = { {0xefbc84, 0x8190}, /* U+FF04 FULLWIDTH DOLLAR SIGN */ {0xefbc85, 0x8193}, /* U+FF05 FULLWIDTH PERCENT SIGN */ {0xefbc86, 0x8195}, /* U+FF06 FULLWIDTH AMPERSAND */ - {0xefbc87, 0x81ad}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */ + {0xefbc87, 0x81ad}, /* U+FF07 FULLWIDTH APOSTROPHE */ {0xefbc88, 0x8169}, /* U+FF08 FULLWIDTH LEFT PARENTHESIS */ {0xefbc89, 0x816a}, /* U+FF09 FULLWIDTH RIGHT PARENTHESIS */ {0xefbc8a, 0x8196}, /* U+FF0A FULLWIDTH ASTERISK */ diff --git a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map index e55d4a2a6c..3642851fd6 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map +++ b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_SHIFT_JIS_2004.pl - */ -static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = { +/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map */ + +static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[ 25 ] = { /* */ {0x0000c3a6, 0x0000cc80, 0x8663}, /* U+00E6+0300 [2000] */ 
{0x0000c994, 0x0000cc80, 0x8667}, /* U+0254+0300 [2000] */ {0x0000c994, 0x0000cc81, 0x8668}, /* U+0254+0301 [2000] */ diff --git a/src/backend/utils/mb/Unicode/utf8_to_sjis.map b/src/backend/utils/mb/Unicode/utf8_to_sjis.map index fb0566a1db..cd6ea48ffc 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_sjis.map +++ b/src/backend/utils/mb/Unicode/utf8_to_sjis.map @@ -3,7 +3,7 @@ static const pg_utf_to_local ULmapSJIS[ 7397 ] = { {0xc2a2, 0x8191}, {0xc2a3, 0x8192}, - {0xc2a5, 0x5c}, + {0xc2a5, 0x005c}, {0xc2a7, 0x8198}, {0xc2a8, 0x814e}, {0xc2ac, 0x81ca}, @@ -142,7 +142,7 @@ static const pg_utf_to_local ULmapSJIS[ 7397 ] = { {0xe280b2, 0x818c}, {0xe280b3, 0x818d}, {0xe280bb, 0x81a6}, - {0xe280be, 0x7e}, + {0xe280be, 0x007e}, {0xe28483, 0x818e}, {0xe28496, 0xfa59}, {0xe284a1, 0xfa5a}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_uhc.map b/src/backend/utils/mb/Unicode/utf8_to_uhc.map index 15dfb56a09..dc04726364 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_uhc.map +++ b/src/backend/utils/mb/Unicode/utf8_to_uhc.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/utf8_to_uhc.map */ + static const pg_utf_to_local ULmapUHC[ 17237 ] = { {0xc2a1, 0xa2ae}, {0xc2a4, 0xa2b4},