diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile index 9d2ef5e3d2..ea21f4a852 100644 --- a/src/backend/utils/mb/Unicode/Makefile +++ b/src/backend/utils/mb/Unicode/Makefile @@ -39,8 +39,6 @@ WINMAPS = win866_to_utf8.map utf8_to_win866.map \ win1258_to_utf8.map utf8_to_win1258.map GENERICMAPS = $(ISO8859MAPS) $(WINMAPS) \ - johab_to_utf8.map utf8_to_johab.map \ - uhc_to_utf8.map utf8_to_uhc.map \ gbk_to_utf8.map utf8_to_gbk.map \ koi8r_to_utf8.map utf8_to_koi8r.map @@ -51,6 +49,8 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \ sjis_to_utf8.map utf8_to_sjis.map \ gb18030_to_utf8.map utf8_to_gb18030.map \ big5_to_utf8.map utf8_to_big5.map \ + johab_to_utf8.map utf8_to_johab.map \ + uhc_to_utf8.map utf8_to_uhc.map \ euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map \ utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map \ shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map \ @@ -63,23 +63,29 @@ ISO8859TEXTS = 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT \ 8859-10.TXT 8859-13.TXT 8859-14.TXT 8859-15.TXT \ 8859-16.TXT -WINTEXTS = CP866.TXT CP874.TXT CP936.TXT CP949.TXT \ +WINTEXTS = CP866.TXT CP874.TXT CP936.TXT \ CP1250.TXT CP1251.TXT \ CP1252.TXT CP1253.TXT CP1254.TXT CP1255.TXT \ CP1256.TXT CP1257.TXT CP1258.TXT GENERICTEXTS = $(ISO8859TEXTS) $(WINTEXTS) \ - KOI8-R.TXT KOI8-U.TXT JOHAB.TXT + KOI8-R.TXT KOI8-U.TXT all: $(MAPS) $(GENERICMAPS): UCS_to_most.pl $(GENERICTEXTS) $(PERL) $< -euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl JIS0201.TXT JIS0208.TXT JIS0212.TXT +johab_to_utf8.map utf8_to_johab.map: UCS_to_JOHAB.pl JOHAB.TXT $(PERL) $< -euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl GB2312.TXT +uhc_to_utf8.map utf8_to_uhc.map: UCS_to_UHC.pl windows-949-2000.xml + $(PERL) $< + +euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl CP932.TXT JIS0212.TXT + $(PERL) $< + +euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl gb-18030-2000.xml $(PERL) $< euc_kr_to_utf8.map 
utf8_to_euc_kr.map: UCS_to_EUC_KR.pl KSX1001.TXT @@ -119,7 +125,7 @@ BIG5.TXT CNS11643.TXT: euc-jis-2004-std.txt sjis-0213-2004-std.txt: $(DOWNLOAD) http://x0213.org/codetable/$(@F) -gb-18030-2000.xml: +gb-18030-2000.xml windows-949-2000.xml: $(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F) GB2312.TXT: @@ -137,7 +143,7 @@ KOI8-R.TXT KOI8-U.TXT: $(ISO8859TEXTS): $(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F) -$(filter-out CP8%,$(WINTEXTS)): +$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT: $(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F) $(filter CP8%,$(WINTEXTS)): diff --git a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl index 127fd157b0..6a1321bab8 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl @@ -25,56 +25,17 @@ # # and Unicode name (not used in this script) -require "ucs2utf.pl"; +require "convutils.pm"; +# Load BIG5.TXT +my $all = &read_source("BIG5.TXT"); -# -# first, generate UTF8 --> BIG5 table -# -$in_file = "BIG5.TXT"; +# Load CP950.TXT +my $cp950txt = &read_source("CP950.TXT"); -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$utf} = $code; - } -} -close(FILE); - -$in_file = "CP950.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); +foreach my $i (@$cp950txt) { + my $code = $i->{code}; + my $ucs = $i->{ucs}; # Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc # from CP950.TXT @@ -83,126 +44,25 @@ while () 
&& $code >= 0xf9d6 && $code <= 0xf9dc) { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$utf} = $code; - } -} -close(FILE); - -$file = lc("utf8_to_big5.map"); -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapBIG5[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; + push @$all, {code => $code, + ucs => $ucs, + comment => $i->{comment}, + direction => "both"}; } } -print FILE "};\n"; -close(FILE); +foreach my $i (@$all) { + my $code = $i->{code}; + my $ucs = $i->{ucs}; -# -# then generate BIG5 --> UTF8 table -# -$in_file = "BIG5.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; - -while () -{ - chop; - if (/^#/) + # BIG5.TXT maps several BIG5 characters to U+FFFD. The UTF-8 to BIG5 mapping can + # contain only one of them. XXX: Doesn't really make sense to include any of them, + # but for historical reasons, we map the first one of them. 
+ if ($i->{ucs} == 0xFFFD && $i->{code} != 0xA15A) { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$code} = $utf; - } -} -close(FILE); - -$in_file = "CP950.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - - # Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc - # from CP950.TXT - if ( $code >= 0x80 - && $ucs >= 0x0080 - && $code >= 0xf9d6 - && $code <= 0xf9dc) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$code} = $utf; - } -} -close(FILE); - -$file = lc("big5_to_utf8.map"); -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf LUmapBIG5[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) -{ - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; + $i->{direction} = "to_unicode"; } } -print FILE "};\n"; -close(FILE); +# Output +print_tables("BIG5", $all); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl index 53f44773c9..8df23f8be6 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl @@ -1,128 +1,76 @@ #! 
/usr/bin/perl # -# Copyright (c) 2001-2016, PostgreSQL Global Development Group +# Copyright (c) 2007-2016, PostgreSQL Global Development Group # -# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl +# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl # -# Generate UTF-8 <--> EUC_CN code conversion tables from -# map files provided by Unicode organization. -# Unfortunately it is prohibited by the organization -# to distribute the map files. So if you try to use this script, -# you have to obtain GB2312.TXT from -# the organization's ftp site. +# Generate UTF-8 <--> GB18030 code conversion tables from +# "gb-18030-2000.xml", obtained from +# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ # -# GB2312.TXT format: -# GB2312 code in hex -# UCS-2 code in hex -# # and Unicode name (not used in this script) +# The lines we care about in the source file look like +# +# where the "u" field is the Unicode code point in hex, +# and the "b" field is the hex byte sequence for GB18030 -require "ucs2utf.pl"; +require "convutils.pm"; -# first generate UTF-8 --> EUC_CN table +# Read the input -$in_file = "GB2312.TXT"; +$in_file = "gb-18030-2000.xml"; open(FILE, $in_file) || die("cannot open $in_file"); -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $array{$utf} = ($code | 0x8080); - } -} -close(FILE); - -$file = "utf8_to_euc_cn.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapEUC_CN[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 
0x%04x},\n", $index, $code; - } -} - -print FILE "};\n"; -close(FILE); - -# -# then generate EUC_CN --> UTF8 table -# -reset 'array'; - -open(FILE, $in_file) || die("cannot open $in_file"); +my @mapping; while () { - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; + next if (!m/= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - $code |= 0x8080; - $array{$code} = $utf; + # The GB-18030 character set, which we use as the source, contains + # a lot of extra characters on top of the GB2312 character set that + # EUC_CN encodes. Filter out those extra characters. + next if (($code & 0xFF) < 0xA1); + next if (!($code >= 0xA100 && $code <= 0xA9FF || + $code >= 0xB000 && $code <= 0xF7FF)); + + next if ($code >= 0xA2A1 && $code <= 0xA2B0); + next if ($code >= 0xA2E3 && $code <= 0xA2E4); + next if ($code >= 0xA2EF && $code <= 0xA2F0); + next if ($code >= 0xA2FD && $code <= 0xA2FE); + next if ($code >= 0xA4F4 && $code <= 0xA4FE); + next if ($code >= 0xA5F7 && $code <= 0xA5FE); + next if ($code >= 0xA6B9 && $code <= 0xA6C0); + next if ($code >= 0xA6D9 && $code <= 0xA6FE); + next if ($code >= 0xA7C2 && $code <= 0xA7D0); + next if ($code >= 0xA7F2 && $code <= 0xA7FE); + next if ($code >= 0xA8BB && $code <= 0xA8C4); + next if ($code >= 0xA8EA && $code <= 0xA8FE); + next if ($code >= 0xA9A1 && $code <= 0xA9A3); + next if ($code >= 0xA9F0 && $code <= 0xA9FE); + next if ($code >= 0xD7FA && $code <= 0xD7FE); + + # A couple of characters are mapped differently from GB-2312 or GB-18030 + if ($code == 0xA1A4) + { + $ucs = 0x30FB; + } + if ($code == 0xA1AA) + { + $ucs = 0x2015; + } + + push @mapping, { + ucs => $ucs, + code => $code, + direction => 'both' } } close(FILE); -$file = "euc_cn_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf 
LUmapEUC_CN[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) -{ - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("EUC_CN", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl index d2f1b757cb..b4e140b657 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl @@ -7,9 +7,7 @@ # Generate UTF-8 <--> EUC_JIS_2004 code conversion tables from # "euc-jis-2004-std.txt" (http://x0213.org) -require "ucs2utf.pl"; - -$TEST = 0; +require "convutils.pm"; # first generate UTF-8 --> EUC_JIS_2004 table @@ -17,10 +15,7 @@ $in_file = "euc-jis-2004-std.txt"; open(FILE, $in_file) || die("cannot open $in_file"); -reset 'array'; -reset 'array1'; -reset 'comment'; -reset 'comment1'; +my @all; while ($line = ) { @@ -31,14 +26,14 @@ while ($line = ) $u2 = $3; $rest = "U+" . $u1 . "+" . $u2 . 
$4; $code = hex($c); - $ucs = hex($u1); - $utf1 = &ucs2utf($ucs); - $ucs = hex($u2); - $utf2 = &ucs2utf($ucs); - $str = sprintf "%08x%08x", $utf1, $utf2; - $array1{$str} = $code; - $comment1{$str} = $rest; - $count1++; + $ucs1 = hex($u1); + $ucs2 = hex($u2); + + push @all, { direction => 'both', + ucs => $ucs1, + ucs_second => $ucs2, + code => $code, + comment => $rest }; next; } elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) @@ -54,252 +49,11 @@ while ($line = ) $ucs = hex($u); $code = hex($c); - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$utf} = $code; - $comment{$code} = $rest; + next if ($code < 0x80 && $ucs < 0x80); + + push @all, { direction => 'both', ucs => $ucs, code => $code, comment => $rest }; } close(FILE); -$file = "utf8_to_euc_jis_2004.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE "static const pg_utf_to_local ULmapEUC_JIS_2004[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%08x, 0x%06x} /* %s */\n", $index, $code, - $comment{$code}; - } - else - { - printf FILE " {0x%08x, 0x%06x}, /* %s */\n", $index, $code, - $comment{$code}; - } -} - -print FILE "};\n"; -close(FILE); - -if ($TEST == 1) -{ - $file1 = "utf8.data"; - $file2 = "euc_jis_2004.data"; - open(FILE1, "> $file1") || die("cannot open $file1"); - open(FILE2, "> $file2") || die("cannot open $file2"); - - for $index (sort { $a <=> $b } keys(%array)) - { - $code = $array{$index}; - if ( $code > 0x00 - && $code != 0x09 - && $code != 0x0a - && $code != 0x0d - && $code != 0x5c - && ( $code < 0x80 - || ($code >= 0x8ea1 && $code <= 0x8efe) - || ($code >= 0x8fa1a1 && $code <= 0x8ffefe) - || ($code >= 0xa1a1 && $code <= 0x8fefe))) - { - for ($i = 3; $i >= 0; $i--) - { - $s = 
$i * 8; - $mask = 0xff << $s; - print FILE1 pack("C", ($index & $mask) >> $s) - if $index & $mask; - print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask; - } - print FILE1 "\n"; - print FILE2 "\n"; - } - } -} - -$file = "utf8_to_euc_jis_2004_combined.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE - "static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {\n"; - -for $index (sort { $a cmp $b } keys(%array1)) -{ - $code = $array1{$index}; - $count1--; - if ($count1 == 0) - { - printf FILE " {0x%s, 0x%s, 0x%06x} /* %s */\n", substr($index, 0, 8), - substr($index, 8, 8), $code, $comment1{$index}; - } - else - { - printf FILE " {0x%s, 0x%s, 0x%06x}, /* %s */\n", - substr($index, 0, 8), substr($index, 8, 8), $code, - $comment1{$index}; - } -} - -print FILE "};\n"; -close(FILE); - -if ($TEST == 1) -{ - for $index (sort { $a cmp $b } keys(%array1)) - { - $code = $array1{$index}; - if ( $code > 0x00 - && $code != 0x09 - && $code != 0x0a - && $code != 0x0d - && $code != 0x5c - && ( $code < 0x80 - || ($code >= 0x8ea1 && $code <= 0x8efe) - || ($code >= 0x8fa1a1 && $code <= 0x8ffefe) - || ($code >= 0xa1a1 && $code <= 0x8fefe))) - { - - $v1 = hex(substr($index, 0, 8)); - $v2 = hex(substr($index, 8, 8)); - - for ($i = 3; $i >= 0; $i--) - { - $s = $i * 8; - $mask = 0xff << $s; - print FILE1 pack("C", ($v1 & $mask) >> $s) if $v1 & $mask; - print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask; - } - for ($i = 3; $i >= 0; $i--) - { - $s = $i * 8; - $mask = 0xff << $s; - print FILE1 pack("C", ($v2 & $mask) >> $s) if $v2 & $mask; - } - print FILE1 "\n"; - print FILE2 "\n"; - } - } - close(FILE1); - close(FILE2); -} - -# then generate EUC_JIS_2004 --> UTF-8 table - -$in_file = "euc-jis-2004-std.txt"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; -reset 'array1'; -reset 'comment'; -reset 
'comment1'; - -while ($line = ) -{ - if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/) - { - $c = $1; - $u1 = $2; - $u2 = $3; - $rest = "U+" . $u1 . "+" . $u2 . $4; - $code = hex($c); - $ucs = hex($u1); - $utf1 = &ucs2utf($ucs); - $ucs = hex($u2); - $utf2 = &ucs2utf($ucs); - $str = sprintf "%08x%08x", $utf1, $utf2; - $array1{$code} = $str; - $comment1{$code} = $rest; - $count1++; - next; - } - elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) - { - $c = $1; - $u = $2; - $rest = "U+" . $u . $3; - } - else - { - next; - } - - $ucs = hex($u); - $code = hex($c); - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $array{$code} = $utf; - $comment{$utf} = $rest; -} -close(FILE); - -$file = "euc_jis_2004_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE "static const pg_local_to_utf LUmapEUC_JIS_2004[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%06x, 0x%08x} /* %s */\n", $index, $code, - $comment{$code}; - } - else - { - printf FILE " {0x%06x, 0x%08x}, /* %s */\n", $index, $code, - $comment{$code}; - } -} - -print FILE "};\n"; -close(FILE); - -$file = "euc_jis_2004_to_utf8_combined.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE - "static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array1)) -{ - $code = $array1{$index}; - $count1--; - if ($count1 == 0) - { - printf FILE " {0x%06x, 0x%s, 0x%s} /* %s */\n", $index, - substr($code, 0, 8), substr($code, 8, 8), $comment1{$index}; - } - else - { - printf FILE " {0x%06x, 0x%s, 0x%s}, /* %s */\n", $index, - 
substr($code, 0, 8), substr($code, 8, 8), $comment1{$index}; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("EUC_JIS_2004", \@all, 1); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl index 055fc849ba..0e9dd292bf 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl @@ -8,275 +8,223 @@ # map files provided by Unicode organization. # Unfortunately it is prohibited by the organization # to distribute the map files. So if you try to use this script, -# you have to obtain JIS0201.TXT, JIS0208.TXT, JIS0212.TXT from -# the organization's ftp site. -# -# JIS0201.TXT format: -# JIS0201 code in hex -# UCS-2 code in hex -# # and Unicode name (not used in this script) -# -# JIS0208.TXT format: -# JIS0208 shift-JIS code in hex -# JIS0208 code in hex -# UCS-2 code in hex -# # and Unicode name (not used in this script) -# -# JIS0212.TXT format: -# JIS0212 code in hex -# UCS-2 code in hex -# # and Unicode name (not used in this script) +# you have to obtain CP932.TXT and JIS0212.TXT from the +# organization's ftp site. -require "ucs2utf.pl"; +use strict; +require "convutils.pm"; -# first generate UTF-8 --> EUC_JP table +# Load JIS0212.TXT +my $jis0212 = &read_source("JIS0212.TXT"); -# -# JIS0201 -# -$in_file = "JIS0201.TXT"; +my @mapping; -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; - -while () -{ - chop; - if (/^#/) +foreach my $i (@$jis0212) { + # We have a different mapping for this in the EUC_JP to UTF-8 direction. 
+ if ($i->{code} == 0x2243) { - next; + $i->{direction} = "from_unicode"; } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - # add single shift 2 - $array{$utf} = ($code | 0x8e00); + if ($i->{code} == 0x2271) + { + $i->{direction} = "to_unicode"; } -} -close(FILE); -# -# JIS0208 -# -$in_file = "JIS0208.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) + if ($i->{ucs} >= 0x080) { - next; - } - ($s, $c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $array{$utf} = ($code | 0x8080); - } -} -close(FILE); - -# -# JIS0212 -# -$in_file = "JIS0212.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $array{$utf} = ($code | 0x8f8080); - } -} -close(FILE); - -$file = "utf8_to_euc_jp.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapEUC_JP[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; + $i->{code} = $i->{code} | 0x8f8080; } else { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; + next; } + + push @mapping, $i; } -print FILE "};\n"; -close(FILE); +# Load CP932.TXT. 
+my $ct932 = &read_source("CP932.TXT"); -# -# then generate EUC_JP --> UTF8 table -# +foreach my $i (@$ct932) { + my $sjis = $i->{code}; -# -# JIS0201 -# -$in_file = "JIS0201.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; - -while () -{ - chop; - if (/^#/) + # We have a different mapping for this in the EUC_JP to UTF-8 direction. + if ($sjis == 0xeefa || + $sjis == 0xeefb || + $sjis == 0xeefc) { next; } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - # add single shift 2 - $code |= 0x8e00; - $array{$code} = $utf; + if ($sjis >= 0xa1) + { + my $jis = &sjis2jis($sjis); + + $i->{code} = $jis | ($jis < 0x100 ? 0x8e00 : + ($sjis >= 0xeffd ? 0x8f8080 : 0x8080)); + + # Remember the SJIS code for later. + $i->{sjis} = $sjis; + + push @mapping, $i; } } -close(FILE); -# -# JIS0208 -# -$in_file = "JIS0208.TXT"; +foreach my $i (@mapping) { + my $sjis = $i->{sjis}; -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) + # These SJIS characters are excluded completely. 
+ if ($sjis >= 0xed00 && $sjis <= 0xeef9 || + $sjis >= 0xfa54 && $sjis <= 0xfa56 || + $sjis >= 0xfa58 && $sjis <= 0xfc4b) { + $i->{direction} = "none"; next; } - ($s, $c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - - $code |= 0x8080; - $array{$code} = $utf; - } -} -close(FILE); - -# -# JIS0212 -# -$in_file = "JIS0212.TXT"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) + + # These SJIS characters are only in the UTF-8 to EUC_JP table + if ($sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc) { + $i->{direction} = "from_unicode"; next; } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - $code |= 0x8f8080; - $array{$code} = $utf; + if ($sjis == 0x8790 || $sjis == 0x8791 || $sjis == 0x8792 || + $sjis == 0x8795 || $sjis == 0x8796 || $sjis == 0x8797 || + $sjis == 0x879a || $sjis == 0x879b || $sjis == 0x879c || + ($sjis >= 0xfa4a && $sjis <= 0xfa53)) + { + $i->{direction} = "to_unicode"; + next; } } -close(FILE); -$file = "euc_jp_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); +push @mapping, ( + {direction => 'both', ucs => 0x4efc, code => 0x8ff4af, comment => '# CJK(4EFC)'}, + {direction => 'both', ucs => 0x50f4, code => 0x8ff4b0, comment => '# CJK(50F4)'}, + {direction => 'both', ucs => 0x51EC, code => 0x8ff4b1, comment => '# CJK(51EC)'}, + {direction => 'both', ucs => 0x5307, code => 0x8ff4b2, comment => '# CJK(5307)'}, + {direction => 'both', ucs => 0x5324, code => 0x8ff4b3, comment => '# CJK(5324)'}, + {direction => 'both', ucs => 0x548A, code => 0x8ff4b5, comment => '# CJK(548A)'}, + {direction => 'both', ucs => 
0x5759, code => 0x8ff4b6, comment => '# CJK(5759)'}, + {direction => 'both', ucs => 0x589E, code => 0x8ff4b9, comment => '# CJK(589E)'}, + {direction => 'both', ucs => 0x5BEC, code => 0x8ff4ba, comment => '# CJK(5BEC)'}, + {direction => 'both', ucs => 0x5CF5, code => 0x8ff4bb, comment => '# CJK(5CF5)'}, + {direction => 'both', ucs => 0x5D53, code => 0x8ff4bc, comment => '# CJK(5D53)'}, + {direction => 'both', ucs => 0x5FB7, code => 0x8ff4be, comment => '# CJK(5FB7)'}, + {direction => 'both', ucs => 0x6085, code => 0x8ff4bf, comment => '# CJK(6085)'}, + {direction => 'both', ucs => 0x6120, code => 0x8ff4c0, comment => '# CJK(6120)'}, + {direction => 'both', ucs => 0x654E, code => 0x8ff4c1, comment => '# CJK(654E)'}, + {direction => 'both', ucs => 0x663B, code => 0x8ff4c2, comment => '# CJK(663B)'}, + {direction => 'both', ucs => 0x6665, code => 0x8ff4c3, comment => '# CJK(6665)'}, + {direction => 'both', ucs => 0x6801, code => 0x8ff4c6, comment => '# CJK(6801)'}, + {direction => 'both', ucs => 0x6A6B, code => 0x8ff4c9, comment => '# CJK(6A6B)'}, + {direction => 'both', ucs => 0x6AE2, code => 0x8ff4ca, comment => '# CJK(6AE2)'}, + {direction => 'both', ucs => 0x6DF2, code => 0x8ff4cc, comment => '# CJK(6DF2)'}, + {direction => 'both', ucs => 0x6DF8, code => 0x8ff4cb, comment => '# CJK(6DF8)'}, + {direction => 'both', ucs => 0x7028, code => 0x8ff4cd, comment => '# CJK(7028)'}, + {direction => 'both', ucs => 0x70BB, code => 0x8ff4ae, comment => '# CJK(70BB)'}, + {direction => 'both', ucs => 0x7501, code => 0x8ff4d0, comment => '# CJK(7501)'}, + {direction => 'both', ucs => 0x7682, code => 0x8ff4d1, comment => '# CJK(7682)'}, + {direction => 'both', ucs => 0x769E, code => 0x8ff4d2, comment => '# CJK(769E)'}, + {direction => 'both', ucs => 0x7930, code => 0x8ff4d4, comment => '# CJK(7930)'}, + {direction => 'both', ucs => 0x7AE7, code => 0x8ff4d9, comment => '# CJK(7AE7)'}, + {direction => 'both', ucs => 0x7DA0, code => 0x8ff4dc, comment => '# CJK(7DA0)'}, + {direction 
=> 'both', ucs => 0x7DD6, code => 0x8ff4dd, comment => '# CJK(7DD6)'}, + {direction => 'both', ucs => 0x8362, code => 0x8ff4df, comment => '# CJK(8362)'}, + {direction => 'both', ucs => 0x85B0, code => 0x8ff4e1, comment => '# CJK(85B0)'}, + {direction => 'both', ucs => 0x8807, code => 0x8ff4e4, comment => '# CJK(8807)'}, + {direction => 'both', ucs => 0x8B7F, code => 0x8ff4e6, comment => '# CJK(8B7F)'}, + {direction => 'both', ucs => 0x8CF4, code => 0x8ff4e7, comment => '# CJK(8CF4)'}, + {direction => 'both', ucs => 0x8D76, code => 0x8ff4e8, comment => '# CJK(8D76)'}, + {direction => 'both', ucs => 0x90DE, code => 0x8ff4ec, comment => '# CJK(90DE)'}, + {direction => 'both', ucs => 0x9115, code => 0x8ff4ee, comment => '# CJK(9115)'}, + {direction => 'both', ucs => 0x9592, code => 0x8ff4f1, comment => '# CJK(9592)'}, + {direction => 'both', ucs => 0x973B, code => 0x8ff4f4, comment => '# CJK(973B)'}, + {direction => 'both', ucs => 0x974D, code => 0x8ff4f5, comment => '# CJK(974D)'}, + {direction => 'both', ucs => 0x9751, code => 0x8ff4f6, comment => '# CJK(9751)'}, + {direction => 'both', ucs => 0x999E, code => 0x8ff4fa, comment => '# CJK(999E)'}, + {direction => 'both', ucs => 0x9AD9, code => 0x8ff4fb, comment => '# CJK(9AD9)'}, + {direction => 'both', ucs => 0x9B72, code => 0x8ff4fc, comment => '# CJK(9B72)'}, + {direction => 'both', ucs => 0x9ED1, code => 0x8ff4fe, comment => '# CJK(9ED1)'}, + {direction => 'both', ucs => 0xF929, code => 0x8ff4c5, comment => '# CJK COMPATIBILITY IDEOGRAPH-F929'}, + {direction => 'both', ucs => 0xF9DC, code => 0x8ff4f2, comment => '# CJK COMPATIBILITY IDEOGRAPH-F9DC'}, + {direction => 'both', ucs => 0xFA0E, code => 0x8ff4b4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0E'}, + {direction => 'both', ucs => 0xFA0F, code => 0x8ff4b7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA0F'}, + {direction => 'both', ucs => 0xFA10, code => 0x8ff4b8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA10'}, + {direction => 'both', ucs => 0xFA11, code => 
0x8ff4bd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA11'}, + {direction => 'both', ucs => 0xFA12, code => 0x8ff4c4, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA12'}, + {direction => 'both', ucs => 0xFA13, code => 0x8ff4c7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA13'}, + {direction => 'both', ucs => 0xFA14, code => 0x8ff4c8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA14'}, + {direction => 'both', ucs => 0xFA15, code => 0x8ff4ce, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA15'}, + {direction => 'both', ucs => 0xFA16, code => 0x8ff4cf, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA16'}, + {direction => 'both', ucs => 0xFA17, code => 0x8ff4d3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA17'}, + {direction => 'both', ucs => 0xFA18, code => 0x8ff4d5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA18'}, + {direction => 'both', ucs => 0xFA19, code => 0x8ff4d6, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA19'}, + {direction => 'both', ucs => 0xFA1A, code => 0x8ff4d7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1A'}, + {direction => 'both', ucs => 0xFA1B, code => 0x8ff4d8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1B'}, + {direction => 'both', ucs => 0xFA1C, code => 0x8ff4da, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1C'}, + {direction => 'both', ucs => 0xFA1D, code => 0x8ff4db, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1D'}, + {direction => 'both', ucs => 0xFA1E, code => 0x8ff4de, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1E'}, + {direction => 'both', ucs => 0xFA1F, code => 0x8ff4e0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA1F'}, + {direction => 'both', ucs => 0xFA20, code => 0x8ff4e2, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA20'}, + {direction => 'both', ucs => 0xFA21, code => 0x8ff4e3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA21'}, + {direction => 'both', ucs => 0xFA22, code => 0x8ff4e5, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA22'}, + {direction => 'both', ucs => 0xFA23, code => 0x8ff4e9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA23'}, + {direction 
=> 'both', ucs => 0xFA24, code => 0x8ff4ea, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA24'}, + {direction => 'both', ucs => 0xFA25, code => 0x8ff4eb, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA25'}, + {direction => 'both', ucs => 0xFA26, code => 0x8ff4ed, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA26'}, + {direction => 'both', ucs => 0xFA27, code => 0x8ff4ef, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA27'}, + {direction => 'both', ucs => 0xFA28, code => 0x8ff4f0, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA28'}, + {direction => 'both', ucs => 0xFA29, code => 0x8ff4f3, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA29'}, + {direction => 'both', ucs => 0xFA2A, code => 0x8ff4f7, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2A'}, + {direction => 'both', ucs => 0xFA2B, code => 0x8ff4f8, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2B'}, + {direction => 'both', ucs => 0xFA2C, code => 0x8ff4f9, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2C'}, + {direction => 'both', ucs => 0xFA2D, code => 0x8ff4fd, comment => '# CJK COMPATIBILITY IDEOGRAPH-FA2D'}, + {direction => 'both', ucs => 0xFF07, code => 0x8ff4a9, comment => '# FULLWIDTH APOSTROPHE'}, + {direction => 'both', ucs => 0xFFE4, code => 0x8fa2c3, comment => '# FULLWIDTH BROKEN BAR'}, -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf LUmapEUC_JP[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) + # additional conversions for EUC_JP -> UTF-8 conversion + {direction => 'to_unicode', ucs => 0x2116, code => 0x8ff4ac, comment => '# NUMERO SIGN'}, + {direction => 'to_unicode', ucs => 0x2121, code => 0x8ff4ad, comment => '# TELEPHONE SIGN'}, + {direction => 'to_unicode', ucs => 0x3231, code => 0x8ff4ab, comment => '# PARENTHESIZED IDEOGRAPH STOCK'} + ); + +print_tables("EUC_JP", \@mapping); + +####################################################################### +# sjis2jis ; SJIS => JIS conversion +sub sjis2jis { - $utf = $array{$index}; - $count--; - if ($count 
== 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} + my ($sjis) = @_; -print FILE "};\n"; -close(FILE); + return $sjis if ($sjis <= 0x100); + + my $hi = $sjis >> 8; + my $lo = $sjis & 0xff; + + if ($lo >= 0x80) { $lo--; } + $lo -= 0x40; + if ($hi >= 0xe0) { $hi -= 0x40; } + $hi -= 0x81; + my $pos = $lo + $hi * 0xbc; + + if ($pos >= 114 * 0x5e && $pos <= 115 * 0x5e + 0x1b) + { + # This region (115-ku) is out of range of JIS code but for + # convenient to generate code in EUC CODESET 3, move this to + # seemingly duplicate region (83-84-ku). + $pos = $pos - ((31 * 0x5e) + 12); + + # after 85-ku 82-ten needs to be moved 2 codepoints + $pos = $pos - 2 if ($pos >= 84 * 0x5c + 82) + } + + my $hi2 = $pos / 0x5e; + my $lo2 = ($pos % 0x5e); + + my $ret = $lo2 + 0x21 + (($hi2 + 0x21) << 8); + + return $ret; +} diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl index a7c94bca91..a917d06717 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl @@ -16,113 +16,22 @@ # UCS-2 code in hex # # and Unicode name (not used in this script) -require "ucs2utf.pl"; +require "convutils.pm"; -# first generate UTF-8 --> EUC_KR table +# Load the source file. 
-$in_file = "KSX1001.TXT"; +my $mapping = &read_source("KSX1001.TXT"); -open(FILE, $in_file) || die("cannot open $in_file"); - -while () +foreach my $i (@$mapping) { - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $array{$utf} = ($code | 0x8080); - } -} -close(FILE); - -$file = "utf8_to_euc_kr.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapEUC_KR[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; - } + $i->{code} = $i->{code} | 0x8080; } -print FILE "};\n"; -close(FILE); +# Some extra characters that are not in KSX1001.TXT +push @$mapping, ( + {direction => 'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN'}, + {direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN'}, + {direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U'} + ); -# -# then generate EUC_KR --> UTF8 table -# -reset 'array'; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - - $code |= 0x8080; - $array{$code} = $utf; - } -} -close(FILE); - -$file = "euc_kr_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; 
-print FILE "static const pg_local_to_utf LUmapEUC_KR[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) -{ - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("EUC_KR", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl index e4fc535b18..aceef5433c 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl @@ -17,141 +17,47 @@ # UCS-2 code in hex # # and Unicode name (not used in this script) -require "ucs2utf.pl"; +require "convutils.pm"; -# first generate UTF-8 --> EUC_TW table +my $mapping = &read_source("CNS11643.TXT"); -$in_file = "CNS11643.TXT"; +my @extras; -open(FILE, $in_file) || die("cannot open $in_file"); - -while () +foreach my $i (@$mapping) { - chop; - if (/^#/) + my $ucs = $i->{ucs}; + my $code = $i->{code}; + my $origcode = $i->{code}; + + my $plane = ($code & 0x1f0000) >> 16; + if ($plane > 16) { + printf STDERR "Warning: invalid plane No.$plane. ignored\n"; next; } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) + + if ($plane == 1) { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - - $plane = ($code & 0x1f0000) >> 16; - if ($plane > 16) - { - printf STDERR "Warning: invalid plane No.$plane. 
ignored\n"; - next; - } - - if ($plane == 1) - { - $array{$utf} = (($code & 0xffff) | 0x8080); - } - else - { - $array{$utf} = - (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080); - } - } -} -close(FILE); - -$file = "utf8_to_euc_tw.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapEUC_TW[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; + $code = ($code & 0xffff) | 0x8080; } else { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; + $code = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080); + } + $i->{code} = $code; + + # Some codes are mapped twice in the EUC_TW to UTF-8 table. + if ($origcode >= 0x12121 && $origcode <= 0x20000) + { + push @extras, { + ucs => $i->{ucs}, + code => ($i->{code} + 0x8ea10000), + rest => $i->{rest}, + direction => 'to_unicode' + } } } -print FILE "};\n"; -close(FILE); +push @$mapping, @extras; -# -# then generate EUC_TW --> UTF8 table -# -reset 'array'; - -open(FILE, $in_file) || die("cannot open $in_file"); - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate code: %04x\n", $ucs; - next; - } - $count++; - - $plane = ($code & 0x1f0000) >> 16; - if ($plane > 16) - { - printf STDERR "Warning: invalid plane No.$plane. 
ignored\n"; - next; - } - - if ($plane == 1) - { - $c = (($code & 0xffff) | 0x8080); - $array{$c} = $utf; - $count++; - } - $c = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080); - $array{$c} = $utf; - } -} -close(FILE); - -$file = "euc_tw_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf LUmapEUC_TW[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) -{ - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("EUC_TW", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl index 043c1c27ec..f58361024e 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl @@ -13,8 +13,7 @@ # where the "u" field is the Unicode code point in hex, # and the "b" field is the hex byte sequence for GB18030 -require "ucs2utf.pl"; - +require "convutils.pm"; # Read the input @@ -22,6 +21,8 @@ $in_file = "gb-18030-2000.xml"; open(FILE, $in_file) || die("cannot open $in_file"); +my @mapping; + while () { next if (!m/) $code = hex($c); if ($code >= 0x80 && $ucs >= 0x0080) { - $utf = &ucs2utf($ucs); - if ($arrayu{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; + push @mapping, { + ucs => $ucs, + code => $code, + direction => 'both' } - if ($arrayc{$code} ne "") - { - printf STDERR "Warning: duplicate GB18030: %08x\n", $code; - next; - } - $arrayu{$utf} = $code; - $arrayc{$code} = $utf; - $count++; } } close(FILE); - -# -# first, generate UTF8 --> GB18030 table -# - -$file = "utf8_to_gb18030.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE 
"static const pg_utf_to_local ULmapGB18030[ $count ] = {\n"; - -$cc = $count; -for $index (sort { $a <=> $b } keys(%arrayu)) -{ - $code = $arrayu{$index}; - $cc--; - if ($cc == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; - } -} - -print FILE "};\n"; -close(FILE); - - -# -# then generate GB18030 --> UTF8 table -# - -$file = "gb18030_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n"; - -$cc = $count; -for $index (sort { $a <=> $b } keys(%arrayc)) -{ - $utf = $arrayc{$index}; - $cc--; - if ($cc == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("GB18030", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl new file mode 100755 index 0000000000..b98f9a7bf5 --- /dev/null +++ b/src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl @@ -0,0 +1,31 @@ +#! /usr/bin/perl +# +# Copyright (c) 2001-2016, PostgreSQL Global Development Group +# +# src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl +# +# Generate UTF-8 <--> JOHAB conversion tables from +# map files provided by Unicode organization. +# Unfortunately it is prohibited by the organization +# to distribute the map files. So if you try to use this script, +# you have to obtain the map files from the organization's ftp site. +# ftp://www.unicode.org/Public/MAPPINGS/ +# We assume the file include three tab-separated columns: +# JOHAB code in hex +# UCS-2 code in hex +# # and Unicode name (not used in this script) + +require "convutils.pm"; + +# Load the source file. 
+ +my $mapping = &read_source("JOHAB.TXT"); + +# Some extra characters that are not in JOHAB.TXT +push @$mapping, ( + {direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN'}, + {direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN'}, + {direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U'} + ); + +print_tables("JOHAB", $mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl index 51ffd86b2c..16a53ad1d9 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl @@ -7,7 +7,7 @@ # Generate UTF-8 <--> SHIFT_JIS_2004 code conversion tables from # "sjis-0213-2004-std.txt" (http://x0213.org) -require "ucs2utf.pl"; +require "convutils.pm"; # first generate UTF-8 --> SHIFT_JIS_2004 table @@ -15,10 +15,7 @@ $in_file = "sjis-0213-2004-std.txt"; open(FILE, $in_file) || die("cannot open $in_file"); -reset 'array'; -reset 'array1'; -reset 'comment'; -reset 'comment1'; +my @mapping; while ($line = ) { @@ -29,14 +26,16 @@ while ($line = ) $u2 = $3; $rest = "U+" . $u1 . "+" . $u2 . 
$4; $code = hex($c); - $ucs = hex($u1); - $utf1 = &ucs2utf($ucs); - $ucs = hex($u2); - $utf2 = &ucs2utf($ucs); - $str = sprintf "%08x%08x", $utf1, $utf2; - $array1{$str} = $code; - $comment1{$str} = $rest; - $count1++; + $ucs1 = hex($u1); + $ucs2 = hex($u2); + + push @mapping, { + code => $code, + ucs => $ucs1, + ucs_second => $ucs2, + comment => $rest, + direction => 'both' + }; next; } elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) @@ -52,183 +51,31 @@ while ($line = ) $ucs = hex($u); $code = hex($c); - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR - "Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf, - $ucs, $code; - next; - } - $count++; - $array{$utf} = $code; - $comment{$code} = $rest; -} -close(FILE); - -$file = "utf8_to_shift_jis_2004.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE "static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%08x, 0x%06x} /* %s */\n", $index, $code, - $comment{$code}; - } - else - { - printf FILE " {0x%08x, 0x%06x}, /* %s */\n", $index, $code, - $comment{$code}; - } -} - -print FILE "};\n"; -close(FILE); - -$file = "utf8_to_shift_jis_2004_combined.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE -"static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = {\n"; - -for $index (sort { $a cmp $b } keys(%array1)) -{ - $code = $array1{$index}; - $count1--; - if ($count1 == 0) - { - printf FILE " {0x%s, 0x%s, 0x%04x} /* %s */\n", substr($index, 0, 8), - substr($index, 8, 8), $code, $comment1{$index}; - } - else - { - printf FILE " {0x%s, 0x%s, 0x%04x}, /* %s */\n", - substr($index, 0, 
8), substr($index, 8, 8), $code, - $comment1{$index}; - } -} - -print FILE "};\n"; -close(FILE); - -# then generate SHIFT_JIS_2004 --> UTF-8 table - -$in_file = "sjis-0213-2004-std.txt"; - -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; -reset 'array1'; -reset 'comment'; -reset 'comment1'; - -while ($line = ) -{ - if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/) - { - $c = $1; - $u1 = $2; - $u2 = $3; - $rest = "U+" . $u1 . "+" . $u2 . $4; - $code = hex($c); - $ucs = hex($u1); - $utf1 = &ucs2utf($ucs); - $ucs = hex($u2); - $utf2 = &ucs2utf($ucs); - $str = sprintf "%08x%08x", $utf1, $utf2; - $array1{$code} = $str; - $comment1{$code} = $rest; - $count1++; - next; - } - elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) - { - $c = $1; - $u = $2; - $rest = "U+" . $u . $3; - } - else + if ($code < 0x80 && $ucs < 0x80) { next; } - - $ucs = hex($u); - $code = hex($c); - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") + elsif ($code < 0x80) { - printf STDERR - "Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf, - $ucs, $code; - printf STDERR "Previous value: UTF8: %08x\n", $array{$utf}; - next; + $direction = 'from_unicode'; } - $count++; - - $array{$code} = $utf; - $comment{$utf} = $rest; -} -close(FILE); - -$file = "shift_jis_2004_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_SHIFTJIS_2004.pl\n"; -print FILE " */\n"; -print FILE "static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) + elsif ($ucs < 0x80) { - printf FILE " {0x%04x, 0x%08x} /* %s */\n", $index, $code, - $comment{$code}; + $direction = 'to_unicode'; } else { - printf FILE " {0x%04x, 0x%08x}, /* %s */\n", $index, $code, - $comment{$code}; + $direction = 'both'; } -} -print FILE "};\n"; + push @mapping, { + code => $code, + ucs => $ucs, + comment => $rest, + 
direction => $direction + }; +} close(FILE); -$file = "shift_jis_2004_to_utf8_combined.map"; -open(FILE, "> $file") || die("cannot open $file"); -print FILE "/*\n"; -print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n"; -print FILE " */\n"; -print FILE -"static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = {\n"; - -for $index (sort { $a <=> $b } keys(%array1)) -{ - $code = $array1{$index}; - $count1--; - if ($count1 == 0) - { - printf FILE " {0x%04x, 0x%s, 0x%s} /* %s */\n", $index, - substr($code, 0, 8), substr($code, 8, 8), $comment1{$index}; - } - else - { - printf FILE " {0x%04x, 0x%s, 0x%s}, /* %s */\n", $index, - substr($code, 0, 8), substr($code, 8, 8), $comment1{$index}; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("SHIFT_JIS_2004", \@mapping, 1); diff --git a/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl b/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl index 10e54b157d..c8ff712af8 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_SJIS.pl @@ -4,138 +4,45 @@ # # src/backend/utils/mb/Unicode/UCS_to_SJIS.pl # -# Generate UTF-8 <--> SJIS code conversion tables from -# map files provided by Unicode organization. -# Unfortunately it is prohibited by the organization -# to distribute the map files. So if you try to use this script, -# you have to obtain SHIFTJIS.TXT from -# the organization's ftp site. -# -# SHIFTJIS.TXT format: -# SHIFTJIS code in hex -# UCS-2 code in hex -# # and Unicode name (not used in this script) -# Warning: SHIFTJIS.TXT contains only JIS0201 and JIS0208. no JIS0212. +# Generate UTF-8 <=> SJIS code conversion tables. Unfortunately it +# is prohibited by the organization to distribute the map files. +# So if you try to use this script, you have to obtain CP932.TXT +# from the organization's +# ftp site.
-require "ucs2utf.pl"; +use strict; +require "convutils.pm"; -# first generate UTF-8 --> SJIS table +my $charset = read_source("CP932.TXT"); -$in_file = "CP932.TXT"; -$count = 0; +# Drop these SJIS codes from the source for UTF8=>SJIS conversion +my @reject_sjis =( + 0xed40..0xeefc, 0x8754..0x875d, 0x878a, 0x8782, + 0x8784, 0xfa5b, 0xfa54, 0x8790..0x8792, 0x8795..0x8797, + 0x879a..0x879c +); -open(FILE, $in_file) || die("cannot open $in_file"); - -while () +foreach my $i (@$charset) { - chop; - if (/^#/) + my $code = $i->{code}; + my $ucs = $i->{ucs}; + + if (grep {$code == $_} @reject_sjis) { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ((($code >= 0xed40) && ($code <= 0xeefc)) - || ( ($code >= 0x8754) - && ($code <= 0x875d)) - || ($code == 0x878a) - || ($code == 0x8782) - || ($code == 0x8784) - || ($code == 0xfa5b) - || ($code == 0xfa54) - || ( ($code >= 0x8790) - && ($code <= 0x8792)) - || ( ($code >= 0x8795) - && ($code <= 0x8797)) - || ( ($code >= 0x879a) - && ($code <= 0x879c))) - { - printf STDERR "Warning: duplicate UTF8: UCS=0x%04x SJIS=0x%04x\n", - $ucs, - $code; - next; - } - $count++; - $array{$utf} = $code; + $i->{direction} = "to_unicode"; } } -close(FILE); +# Add these UTF8->SJIS pairs to the table. 
+push @$charset, ( + {direction => "from_unicode", ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN'}, + {direction => "from_unicode", ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN'}, + {direction => "from_unicode", ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN'}, + {direction => "from_unicode", ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN'}, + {direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE'}, + {direction => "from_unicode", ucs => 0x203e, code => 0x7e, comment => '# OVERLINE'}, + {direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN'}, + {direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH'} +); -$file = "utf8_to_sjis.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_utf_to_local ULmapSJIS[ $count ] = {\n"; - -for $index (sort { $a <=> $b } keys(%array)) -{ - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; - } -} - -print FILE "};\n"; -close(FILE); - -# -# then generate SJIS --> UTF8 table -# - -open(FILE, $in_file) || die("cannot open $in_file"); - -reset 'array'; -$count = 0; - -while () -{ - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - $count++; - - $array{$code} = $utf; - } -} -close(FILE); - -$file = "sjis_to_utf8.map"; -open(FILE, "> $file") || die("cannot open $file"); - -print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; -print FILE "static const pg_local_to_utf LUmapSJIS[ $count ] = {\n"; -for $index (sort { $a <=> $b } keys(%array)) -{ - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { 
- printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } -} - -print FILE "};\n"; -close(FILE); +print_tables("SJIS", $charset); diff --git a/src/backend/utils/mb/Unicode/UCS_to_UHC.pl b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl new file mode 100755 index 0000000000..b6bf3bd8f2 --- /dev/null +++ b/src/backend/utils/mb/Unicode/UCS_to_UHC.pl @@ -0,0 +1,51 @@ +#! /usr/bin/perl +# +# Copyright (c) 2007-2016, PostgreSQL Global Development Group +# +# src/backend/utils/mb/Unicode/UCS_to_UHC.pl +# +# Generate UTF-8 <--> UHC code conversion tables from +# "windows-949-2000.xml", obtained from +# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ +# +# The lines we care about in the source file look like +# +# where the "u" field is the Unicode code point in hex, +# and the "b" field is the hex byte sequence for UHC + +require "convutils.pm"; + +# Read the input + +$in_file = "windows-949-2000.xml"; + +open(FILE, $in_file) || die("cannot open $in_file"); + +my @mapping; + +while () +{ + next if (!m/= 0x80 && $ucs >= 0x0080) + { + push @mapping, { + ucs => $ucs, + code => $code, + direction => 'both' + } + } +} +close(FILE); + +# One extra character that's not in the source file.
+push @mapping, { direction => 'both', code => 0xa2e8, ucs => 0x327e, comment => 'CIRCLED HANGUL IEUNG U' }; + +print_tables("UHC", \@mapping); diff --git a/src/backend/utils/mb/Unicode/UCS_to_most.pl b/src/backend/utils/mb/Unicode/UCS_to_most.pl index 125378f149..a3cf436eef 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_most.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_most.pl @@ -15,7 +15,7 @@ # UCS-2 code in hex # # and Unicode name (not used in this script) -require "ucs2utf.pl"; +require "convutils.pm"; %filename = ( 'WIN866' => 'CP866.TXT', @@ -44,121 +44,13 @@ require "ucs2utf.pl"; 'ISO8859_16' => '8859-16.TXT', 'KOI8R' => 'KOI8-R.TXT', 'KOI8U' => 'KOI8-U.TXT', - 'GBK' => 'CP936.TXT', - 'UHC' => 'CP949.TXT', - 'JOHAB' => 'JOHAB.TXT',); + 'GBK' => 'CP936.TXT'); @charsets = keys(%filename); @charsets = @ARGV if scalar(@ARGV); foreach $charset (@charsets) { + my $mapping = &read_source($filename{$charset}); - # - # first, generate UTF8-> charset table - # - $in_file = $filename{$charset}; - - open(FILE, $in_file) || die("cannot open $in_file"); - - reset 'array'; - - while () - { - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$utf} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$utf} = $code; - } - } - close(FILE); - - $file = lc("utf8_to_${charset}.map"); - open(FILE, "> $file") || die("cannot open $file"); - - print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; - print FILE "static const pg_utf_to_local ULmap${charset}[ $count ] = {\n"; - - for $index (sort { $a <=> $b } keys(%array)) - { - $code = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $code; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $code; - } - } - - print FILE "};\n"; - close(FILE); - - # - # then generate character set code ->UTF8 table 
- # - open(FILE, $in_file) || die("cannot open $in_file"); - - reset 'array'; - - while () - { - chop; - if (/^#/) - { - next; - } - ($c, $u, $rest) = split; - $ucs = hex($u); - $code = hex($c); - if ($code >= 0x80 && $ucs >= 0x0080) - { - $utf = &ucs2utf($ucs); - if ($array{$code} ne "") - { - printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; - next; - } - $count++; - $array{$code} = $utf; - } - } - close(FILE); - - $file = lc("${charset}_to_utf8.map"); - open(FILE, "> $file") || die("cannot open $file"); - - print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n"; - print FILE "static const pg_local_to_utf LUmap${charset}[ $count ] = {\n"; - for $index (sort { $a <=> $b } keys(%array)) - { - $utf = $array{$index}; - $count--; - if ($count == 0) - { - printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; - } - else - { - printf FILE " {0x%04x, 0x%04x},\n", $index, $utf; - } - } - - print FILE "};\n"; - close(FILE); + print_tables($charset, $mapping); } diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm new file mode 100644 index 0000000000..d6a13e8c02 --- /dev/null +++ b/src/backend/utils/mb/Unicode/convutils.pm @@ -0,0 +1,282 @@ +# +# Copyright (c) 2001-2016, PostgreSQL Global Development Group +# +# src/backend/utils/mb/Unicode/convutils.pm + +use strict; + +####################################################################### +# convert UCS-4 to UTF-8 +# +sub ucs2utf +{ + my ($ucs) = @_; + my $utf; + + if ($ucs <= 0x007f) + { + $utf = $ucs; + } + elsif ($ucs > 0x007f && $ucs <= 0x07ff) + { + $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); + } + elsif ($ucs > 0x07ff && $ucs <= 0xffff) + { + $utf = + ((($ucs >> 12) | 0xe0) << 16) | + (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); + } + else + { + $utf = + ((($ucs >> 18) | 0xf0) << 24) | + (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | + (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); + } + return ($utf); +} + 
+####################################################################### +# read_source - common routine to read source file +# +# fname ; input file name +sub read_source +{ + my ($fname) = @_; + my @r; + + open(my $in, '<', $fname) || die("cannot open $fname"); + + while (<$in>) + { + next if (/^#/); + chop; + + next if (/^$/); # Ignore empty lines + + next if (/^0x([0-9A-F]+)\s+(#.*)$/); + + # Skip the first column for JIS0208.TXT + if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/) + { + print STDERR "READ ERROR at line $. in $fname: $_\n"; + exit; + } + my $out = {f => $fname, l => $., + code => hex($1), + ucs => hex($2), + comment => $4, + direction => "both" + }; + + # Ignore pure ASCII mappings. PostgreSQL character conversion code + # never even passes these to the conversion code. + next if ($out->{code} < 0x80 || $out->{ucs} < 0x80); + + push(@r, $out); + } + close($in); + + return \@r; +} + +################################################################## +# print_tables : output mapping tables +# +# Arguments: +# charset - string name of the character set. +# table - mapping table (see format below) +# verbose - if 1, output comment on each line, +# if 2, also output source file name and number +# +# +# +# Mapping table format: +# +# Mapping table is a list of hashes. Each hash has the following fields: +# direction - Direction: 'both', 'from_unicode' or 'to_unicode' +# ucs - Unicode code point +# ucs_second - Second Unicode code point, if this is a "combined" character. 
+# code - Byte sequence in the "other" character set, as an integer +# comment - Text representation of the character +# f - Source filename +# l - Line number in source file +# +# +sub print_tables +{ + my ($charset, $table, $verbose) = @_; + + # Build an array with only the to-UTF8 direction mappings + my @to_unicode; + my @to_unicode_combined; + my @from_unicode; + my @from_unicode_combined; + + foreach my $i (@$table) + { + if (defined $i->{ucs_second}) + { + my $entry = {utf8 => ucs2utf($i->{ucs}), + utf8_second => ucs2utf($i->{ucs_second}), + code => $i->{code}, + comment => $i->{comment}, + f => $i->{f}, l => $i->{l}}; + if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode") + { + push @to_unicode_combined, $entry; + } + if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode") + { + push @from_unicode_combined, $entry; + } + } + else + { + my $entry = {utf8 => ucs2utf($i->{ucs}), + code => $i->{code}, + comment => $i->{comment}, + f => $i->{f}, l => $i->{l}}; + if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode") + { + push @to_unicode, $entry; + } + if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode") + { + push @from_unicode, $entry; + } + } + } + + print_to_utf8_map($charset, \@to_unicode, $verbose); + print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0); + print_from_utf8_map($charset, \@from_unicode, $verbose); + print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0); +} + +sub print_from_utf8_map +{ + my ($charset, $table, $verbose) = @_; + + my $last_comment = ""; + + my $fname = lc("utf8_to_${charset}.map"); + print "- Writing UTF8=>${charset} conversion table: $fname\n"; + open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; + printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". 
+ "static const pg_utf_to_local ULmap${charset}[ %d ] = {", + scalar(@$table)); + my $first = 1; + foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table) + { + print($out ",") if (!$first); + $first = 0; + print($out "\t/* $last_comment */") if ($verbose); + + printf($out "\n {0x%04x, 0x%04x}", $$i{utf8}, $$i{code}); + if ($verbose >= 2) + { + $last_comment = "$$i{f}:$$i{l} $$i{comment}"; + } + else + { + $last_comment = $$i{comment}; + } + } + print($out "\t/* $last_comment */") if ($verbose); + print $out "\n};\n"; + close($out); +} + +sub print_from_utf8_combined_map +{ + my ($charset, $table, $verbose) = @_; + + my $last_comment = ""; + + my $fname = lc("utf8_to_${charset}_combined.map"); + print "- Writing UTF8=>${charset} conversion table: $fname\n"; + open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; + printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n". + "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", + scalar(@$table)); + my $first = 1; + foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table) + { + print($out ",") if (!$first); + $first = 0; + print($out "\t/* $last_comment */") if ($verbose); + + printf($out "\n {0x%08x, 0x%08x, 0x%04x}", $$i{utf8}, $$i{utf8_second}, $$i{code}); + $last_comment = "$$i{comment}"; + } + print($out "\t/* $last_comment */") if ($verbose); + print $out "\n};\n"; + close($out); +} + +sub print_to_utf8_map +{ + my ($charset, $table, $verbose) = @_; + + my $last_comment = ""; + + my $fname = lc("${charset}_to_utf8.map"); + + print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; + open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; + printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". 
+ "static const pg_local_to_utf LUmap${charset}[ %d ] = {", + scalar(@$table)); + my $first = 1; + foreach my $i (sort {$$a{code} <=> $$b{code}} @$table) + { + print($out ",") if (!$first); + $first = 0; + print($out "\t/* $last_comment */") if ($verbose); + + printf($out "\n {0x%04x, 0x%x}", $$i{code}, $$i{utf8}); + if ($verbose >= 2) + { + $last_comment = "$$i{f}:$$i{l} $$i{comment}"; + } + else + { + $last_comment = $$i{comment}; + } + } + print($out "\t/* $last_comment */") if ($verbose); + print $out "\n};\n"; + close($out); +} + +sub print_to_utf8_combined_map +{ + my ($charset, $table, $verbose) = @_; + + my $last_comment = ""; + + my $fname = lc("${charset}_to_utf8_combined.map"); + + print "- Writing ${charset}=>UTF8 conversion table: $fname\n"; + open(my $out, '>', $fname) || die "cannot open output file : $fname\n"; + printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n". + "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", + scalar(@$table)); + my $first = 1; + foreach my $i (sort {$$a{code} <=> $$b{code}} @$table) + { + print($out ",") if (!$first); + $first = 0; + print($out "\t/* $last_comment */") if ($verbose); + + printf($out "\n {0x%04x, 0x%08x, 0x%08x}", $$i{code}, $$i{utf8}, $$i{utf8_second}); + $last_comment = "$$i{comment}"; + } + print($out "\t/* $last_comment */") if ($verbose); + print $out "\n};\n"; + close($out); +} + +1; diff --git a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map index 2c3a607bf8..33fd42ac46 100644 --- a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map +++ b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_EUC_JIS_2004.pl - */ -static const pg_local_to_utf LUmapEUC_JIS_2004[] = { +/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map */ + +static const pg_local_to_utf LUmapEUC_JIS_2004[ 11303 ] = { /* */ {0x0080, 0xc280}, /* U+0080 */ {0x0081, 0xc281}, /* 
U+0081 */ {0x0082, 0xc282}, /* U+0082 */ @@ -205,7 +204,7 @@ static const pg_local_to_utf LUmapEUC_JIS_2004[] = { {0xa2ac, 0xe28691}, /* U+2191 UPWARDS ARROW */ {0xa2ad, 0xe28693}, /* U+2193 DOWNWARDS ARROW */ {0xa2ae, 0xe38093}, /* U+3013 GETA MARK */ - {0xa2af, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */ + {0xa2af, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE */ {0xa2b0, 0xefbc82}, /* U+FF02 FULLWIDTH QUOTATION MARK [2000] */ {0xa2b1, 0xefbc8d}, /* U+FF0D FULLWIDTH HYPHEN-MINUS [2000] */ {0xa2b2, 0xefbd9e}, /* U+FF5E FULLWIDTH TILDE [2000] */ diff --git a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map index 7a7f85b105..2d8987b990 100644 --- a/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map +++ b/src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_EUC_JIS_2004.pl - */ -static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = { +/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map */ + +static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[ 25 ] = { /* */ {0xa4f7, 0x00e3818b, 0x00e3829a}, /* U+304B+309A [2000] */ {0xa4f8, 0x00e3818d, 0x00e3829a}, /* U+304D+309A [2000] */ {0xa4f9, 0x00e3818f, 0x00e3829a}, /* U+304F+309A [2000] */ diff --git a/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map b/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map index db427cbb24..eb17f9829c 100644 --- a/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map +++ b/src/backend/utils/mb/Unicode/euc_jp_to_utf8.map @@ -1,6 +1,6 @@ /* src/backend/utils/mb/Unicode/euc_jp_to_utf8.map */ -static const pg_local_to_utf LUmapEUC_JP[] = { +static const pg_local_to_utf LUmapEUC_JP[ 13197 ] = { {0x8ea1, 0xefbda1}, {0x8ea2, 0xefbda2}, {0x8ea3, 0xefbda3}, @@ -13197,5 +13197,5 @@ static const pg_local_to_utf LUmapEUC_JP[] = { {0x8ff4fb, 0xe9ab99}, {0x8ff4fc, 0xe9adb2}, {0x8ff4fd, 0xefa8ad}, - {0x8ff4fe, 
0xe9bb91}, + {0x8ff4fe, 0xe9bb91} }; diff --git a/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map b/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map index e37152137d..701a7a476f 100644 --- a/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map +++ b/src/backend/utils/mb/Unicode/euc_kr_to_utf8.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/euc_kr_to_utf8.map */ + static const pg_local_to_utf LUmapEUC_KR[ 8227 ] = { {0xa1a1, 0xe38080}, {0xa1a2, 0xe38081}, diff --git a/src/backend/utils/mb/Unicode/johab_to_utf8.map b/src/backend/utils/mb/Unicode/johab_to_utf8.map index 8110f6e853..e31d24184c 100644 --- a/src/backend/utils/mb/Unicode/johab_to_utf8.map +++ b/src/backend/utils/mb/Unicode/johab_to_utf8.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/johab_to_utf8.map */ + static const pg_local_to_utf LUmapJOHAB[ 17049 ] = { {0x8444, 0xe384b3}, {0x8446, 0xe384b5}, diff --git a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map index 81c898c6be..958dde7b83 100644 --- a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map +++ b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_SHIFTJIS_2004.pl - */ -static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = { +/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map */ + +static const pg_local_to_utf LUmapSHIFT_JIS_2004[ 11271 ] = { /* */ {0x00a1, 0xefbda1}, /* U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP */ {0x00a2, 0xefbda2}, /* U+FF62 HALFWIDTH LEFT CORNER BRACKET */ {0x00a3, 0xefbda3}, /* U+FF63 HALFWIDTH RIGHT CORNER BRACKET */ @@ -173,7 +172,7 @@ static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = { {0x81aa, 0xe28691}, /* U+2191 UPWARDS ARROW */ {0x81ab, 0xe28693}, /* U+2193 DOWNWARDS ARROW */ {0x81ac, 0xe38093}, /* U+3013 GETA MARK */ - {0x81ad, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */ + {0x81ad, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE */ {0x81ae, 0xefbc82}, /* U+FF02 FULLWIDTH 
QUOTATION MARK [2000] */ {0x81af, 0xefbc8d}, /* U+FF0D FULLWIDTH HYPHEN-MINUS [2000] */ {0x81b0, 0x7e}, /* U+007E TILDE [2000] Fullwidth: U+FF5E */ diff --git a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map index b1c7bced5f..414e59dc40 100644 --- a/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map +++ b/src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_SHIFT_JIS_2004.pl - */ -static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = { +/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map */ + +static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[ 25 ] = { /* */ {0x82f5, 0x00e3818b, 0x00e3829a}, /* U+304B+309A [2000] */ {0x82f6, 0x00e3818d, 0x00e3829a}, /* U+304D+309A [2000] */ {0x82f7, 0x00e3818f, 0x00e3829a}, /* U+304F+309A [2000] */ diff --git a/src/backend/utils/mb/Unicode/ucs2utf.pl b/src/backend/utils/mb/Unicode/ucs2utf.pl deleted file mode 100644 index e0f1fb226f..0000000000 --- a/src/backend/utils/mb/Unicode/ucs2utf.pl +++ /dev/null @@ -1,35 +0,0 @@ -# -# Copyright (c) 2001-2016, PostgreSQL Global Development Group -# -# src/backend/utils/mb/Unicode/ucs2utf.pl -# convert UCS-4 to UTF-8 -# -sub ucs2utf -{ - local ($ucs) = @_; - local $utf; - - if ($ucs <= 0x007f) - { - $utf = $ucs; - } - elsif ($ucs > 0x007f && $ucs <= 0x07ff) - { - $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); - } - elsif ($ucs > 0x07ff && $ucs <= 0xffff) - { - $utf = - ((($ucs >> 12) | 0xe0) << 16) | - (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); - } - else - { - $utf = - ((($ucs >> 18) | 0xf0) << 24) | - (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | - (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); - } - return ($utf); -} -1; diff --git a/src/backend/utils/mb/Unicode/uhc_to_utf8.map 
b/src/backend/utils/mb/Unicode/uhc_to_utf8.map index 26a7b18f65..65c7e114a3 100644 --- a/src/backend/utils/mb/Unicode/uhc_to_utf8.map +++ b/src/backend/utils/mb/Unicode/uhc_to_utf8.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/uhc_to_utf8.map */ + static const pg_local_to_utf LUmapUHC[ 17237 ] = { {0x8141, 0xeab082}, {0x8142, 0xeab083}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map b/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map index b28eb9cc0c..3d64cd1a60 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map +++ b/src/backend/utils/mb/Unicode/utf8_to_euc_cn.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/utf8_to_euc_cn.map */ + static const pg_utf_to_local ULmapEUC_CN[ 7445 ] = { {0xc2a4, 0xa1e8}, {0xc2a7, 0xa1ec}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map index 5137201217..b50e232b6c 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map +++ b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_EUC_JIS_2004.pl - */ -static const pg_utf_to_local ULmapEUC_JIS_2004[] = { +/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map */ + +static const pg_utf_to_local ULmapEUC_JIS_2004[ 11303 ] = { /* */ {0xc280, 0x0080}, /* U+0080 */ {0xc281, 0x0081}, /* U+0081 */ {0xc282, 0x0082}, /* U+0082 */ @@ -10849,7 +10848,7 @@ static const pg_utf_to_local ULmapEUC_JIS_2004[] = { {0xefbc84, 0xa1f0}, /* U+FF04 FULLWIDTH DOLLAR SIGN */ {0xefbc85, 0xa1f3}, /* U+FF05 FULLWIDTH PERCENT SIGN */ {0xefbc86, 0xa1f5}, /* U+FF06 FULLWIDTH AMPERSAND */ - {0xefbc87, 0xa2af}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */ + {0xefbc87, 0xa2af}, /* U+FF07 FULLWIDTH APOSTROPHE */ {0xefbc88, 0xa1ca}, /* U+FF08 FULLWIDTH LEFT PARENTHESIS */ {0xefbc89, 0xa1cb}, /* U+FF09 FULLWIDTH RIGHT PARENTHESIS */ {0xefbc8a, 0xa1f6}, /* U+FF0A FULLWIDTH ASTERISK */ diff --git 
a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map index d8ff5c0586..0d57667a55 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map +++ b/src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_EUC_JIS_2004.pl - */ -static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = { +/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map */ + +static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[ 25 ] = { /* */ {0x0000c3a6, 0x0000cc80, 0xabc4}, /* U+00E6+0300 [2000] */ {0x0000c994, 0x0000cc80, 0xabc8}, /* U+0254+0300 [2000] */ {0x0000c994, 0x0000cc81, 0xabc9}, /* U+0254+0301 [2000] */ diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map b/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map index 137d4fdef6..eef6db65b3 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map +++ b/src/backend/utils/mb/Unicode/utf8_to_euc_jp.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/utf8_to_euc_jp.map */ + static const pg_utf_to_local ULmapEUC_JP[ 13175 ] = { {0xc2a1, 0x8fa2c2}, {0xc2a4, 0x8fa2f0}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map b/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map index 4a78b260ea..a642b2154f 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map +++ b/src/backend/utils/mb/Unicode/utf8_to_euc_kr.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/utf8_to_euc_kr.map */ + static const pg_utf_to_local ULmapEUC_KR[ 8227 ] = { {0xc2a1, 0xa2ae}, {0xc2a4, 0xa2b4}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_johab.map b/src/backend/utils/mb/Unicode/utf8_to_johab.map index 869f8213d2..78997d82d0 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_johab.map +++ b/src/backend/utils/mb/Unicode/utf8_to_johab.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/utf8_to_johab.map */ + static const pg_utf_to_local ULmapJOHAB[ 17049 ] = 
{ {0xc2a1, 0xd9ae}, {0xc2a4, 0xd9b4}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map index 4fab64fc95..e9f9e638c6 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map +++ b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_SHIFT_JIS_2004.pl - */ -static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = { +/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map */ + +static const pg_utf_to_local ULmapSHIFT_JIS_2004[ 11271 ] = { /* */ {0xc2a0, 0x8541}, /* U+00A0 NO-BREAK SPACE [2000] */ {0xc2a1, 0x8542}, /* U+00A1 INVERTED EXCLAMATION MARK [2000] */ {0xc2a2, 0x8191}, /* U+00A2 CENT SIGN Windows: U+FFE0 */ @@ -10817,7 +10816,7 @@ static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = { {0xefbc84, 0x8190}, /* U+FF04 FULLWIDTH DOLLAR SIGN */ {0xefbc85, 0x8193}, /* U+FF05 FULLWIDTH PERCENT SIGN */ {0xefbc86, 0x8195}, /* U+FF06 FULLWIDTH AMPERSAND */ - {0xefbc87, 0x81ad}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */ + {0xefbc87, 0x81ad}, /* U+FF07 FULLWIDTH APOSTROPHE */ {0xefbc88, 0x8169}, /* U+FF08 FULLWIDTH LEFT PARENTHESIS */ {0xefbc89, 0x816a}, /* U+FF09 FULLWIDTH RIGHT PARENTHESIS */ {0xefbc8a, 0x8196}, /* U+FF0A FULLWIDTH ASTERISK */ diff --git a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map index e55d4a2a6c..3642851fd6 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map +++ b/src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map @@ -1,7 +1,6 @@ -/* - * This file was generated by UCS_to_SHIFT_JIS_2004.pl - */ -static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = { +/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map */ + +static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[ 25 ] = { /* */ {0x0000c3a6, 0x0000cc80, 0x8663}, /* U+00E6+0300 [2000] */ 
{0x0000c994, 0x0000cc80, 0x8667}, /* U+0254+0300 [2000] */ {0x0000c994, 0x0000cc81, 0x8668}, /* U+0254+0301 [2000] */ diff --git a/src/backend/utils/mb/Unicode/utf8_to_sjis.map b/src/backend/utils/mb/Unicode/utf8_to_sjis.map index fb0566a1db..cd6ea48ffc 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_sjis.map +++ b/src/backend/utils/mb/Unicode/utf8_to_sjis.map @@ -3,7 +3,7 @@ static const pg_utf_to_local ULmapSJIS[ 7397 ] = { {0xc2a2, 0x8191}, {0xc2a3, 0x8192}, - {0xc2a5, 0x5c}, + {0xc2a5, 0x005c}, {0xc2a7, 0x8198}, {0xc2a8, 0x814e}, {0xc2ac, 0x81ca}, @@ -142,7 +142,7 @@ static const pg_utf_to_local ULmapSJIS[ 7397 ] = { {0xe280b2, 0x818c}, {0xe280b3, 0x818d}, {0xe280bb, 0x81a6}, - {0xe280be, 0x7e}, + {0xe280be, 0x007e}, {0xe28483, 0x818e}, {0xe28496, 0xfa59}, {0xe284a1, 0xfa5a}, diff --git a/src/backend/utils/mb/Unicode/utf8_to_uhc.map b/src/backend/utils/mb/Unicode/utf8_to_uhc.map index 15dfb56a09..dc04726364 100644 --- a/src/backend/utils/mb/Unicode/utf8_to_uhc.map +++ b/src/backend/utils/mb/Unicode/utf8_to_uhc.map @@ -1,3 +1,5 @@ +/* src/backend/utils/mb/Unicode/utf8_to_uhc.map */ + static const pg_utf_to_local ULmapUHC[ 17237 ] = { {0xc2a1, 0xa2ae}, {0xc2a4, 0xa2b4},