diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile index 9084f03009..da307d8eb9 100644 --- a/src/backend/utils/mb/Unicode/Makefile +++ b/src/backend/utils/mb/Unicode/Makefile @@ -122,7 +122,7 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt: $(DOWNLOAD) http://x0213.org/codetable/$(@F) gb-18030-2000.xml windows-949-2000.xml: - $(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F) + $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F) GB2312.TXT: $(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt' diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl index 092a5b44f5..6d1681a18a 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl @@ -24,12 +24,13 @@ my @all; while (my $line = <$in>) { - if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/) + if ($line =~ /^0x(\w+)\s*U\+(\w+)\+(\w+)\s*#\s*(\S.*)?\s*$/) { # combined characters my ($c, $u1, $u2) = ($1, $2, $3); - my $rest = "U+" . $u1 . "+" . $u2 . $4; + # The "\t \t" below is just to avoid insubstantial diffs. + my $rest = "U+" . $u1 . "+" . $u2 . "\t \t" . $4; my $code = hex($c); my $ucs1 = hex($u1); my $ucs2 = hex($u2); @@ -45,7 +46,7 @@ while (my $line = <$in>) l => $. 
}; } - elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) + elsif ($line =~ /^0x(\w+)\s*U\+(\w+)\s*#\s*(\S.*)?\s*$/) { # non-combined characters diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl index 1d88c0296e..d8bed27e1b 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl @@ -80,7 +80,8 @@ foreach my $i (@$ct932) } } -foreach my $i (@mapping) +# extract only SJIS characters +foreach my $i (grep defined $_->{sjis}, @mapping) { my $sjis = $i->{sjis}; diff --git a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl index b516e91306..b86714dd46 100755 --- a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl +++ b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl @@ -24,12 +24,13 @@ my @mapping; while (my $line = <$in>) { - if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/) + if ($line =~ /^0x(\w+)\s*U\+(\w+)\+(\w+)\s*#\s*(\S.*)?\s*$/) { # combined characters my ($c, $u1, $u2) = ($1, $2, $3); - my $rest = "U+" . $u1 . "+" . $u2 . $4; + # The "\t \t" below is just to avoid insubstantial diffs. + my $rest = "U+" . $u1 . "+" . $u2 . "\t \t" . $4; my $code = hex($c); my $ucs1 = hex($u1); my $ucs2 = hex($u2); @@ -45,7 +46,7 @@ while (my $line = <$in>) l => $. 
}; } - elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) + elsif ($line =~ /^0x(\w+)\s*U\+(\w+)\s*#\s*(\S.*)?\s*$/) { # non-combined characters diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm index 2f64a12ea1..9d97061c6f 100644 --- a/src/backend/utils/mb/Unicode/convutils.pm +++ b/src/backend/utils/mb/Unicode/convutils.pm @@ -380,7 +380,8 @@ sub print_radix_table { header => "Dummy map, for invalid values", min_idx => 0, - max_idx => $widest_range + max_idx => $widest_range, + label => "dummy map" }; ### @@ -471,35 +472,37 @@ sub print_radix_table } # Also look up the positions of the roots in the table. - my $b1root = $segmap{"1-byte"}; - my $b2root = $segmap{"2-byte"}; - my $b3root = $segmap{"3-byte"}; - my $b4root = $segmap{"4-byte"}; + # Missing map represents dummy mapping. + my $b1root = $segmap{"1-byte"} || 0; + my $b2root = $segmap{"2-byte"} || 0; + my $b3root = $segmap{"3-byte"} || 0; + my $b4root = $segmap{"4-byte"} || 0; # And the lower-upper values of each level in each radix tree. - my $b1_lower = $min_idx{1}{1}; - my $b1_upper = $max_idx{1}{1}; + # Missing values represent zero. 
+ my $b1_lower = $min_idx{1}{1} || 0; + my $b1_upper = $max_idx{1}{1} || 0; - my $b2_1_lower = $min_idx{2}{1}; - my $b2_1_upper = $max_idx{2}{1}; - my $b2_2_lower = $min_idx{2}{2}; - my $b2_2_upper = $max_idx{2}{2}; + my $b2_1_lower = $min_idx{2}{1} || 0; + my $b2_1_upper = $max_idx{2}{1} || 0; + my $b2_2_lower = $min_idx{2}{2} || 0; + my $b2_2_upper = $max_idx{2}{2} || 0; - my $b3_1_lower = $min_idx{3}{1}; - my $b3_1_upper = $max_idx{3}{1}; - my $b3_2_lower = $min_idx{3}{2}; - my $b3_2_upper = $max_idx{3}{2}; - my $b3_3_lower = $min_idx{3}{3}; - my $b3_3_upper = $max_idx{3}{3}; + my $b3_1_lower = $min_idx{3}{1} || 0; + my $b3_1_upper = $max_idx{3}{1} || 0; + my $b3_2_lower = $min_idx{3}{2} || 0; + my $b3_2_upper = $max_idx{3}{2} || 0; + my $b3_3_lower = $min_idx{3}{3} || 0; + my $b3_3_upper = $max_idx{3}{3} || 0; - my $b4_1_lower = $min_idx{4}{1}; - my $b4_1_upper = $max_idx{4}{1}; - my $b4_2_lower = $min_idx{4}{2}; - my $b4_2_upper = $max_idx{4}{2}; - my $b4_3_lower = $min_idx{4}{3}; - my $b4_3_upper = $max_idx{4}{3}; - my $b4_4_lower = $min_idx{4}{4}; - my $b4_4_upper = $max_idx{4}{4}; + my $b4_1_lower = $min_idx{4}{1} || 0; + my $b4_1_upper = $max_idx{4}{1} || 0; + my $b4_2_lower = $min_idx{4}{2} || 0; + my $b4_2_upper = $max_idx{4}{2} || 0; + my $b4_3_lower = $min_idx{4}{3} || 0; + my $b4_3_upper = $max_idx{4}{3} || 0; + my $b4_4_lower = $min_idx{4}{4} || 0; + my $b4_4_upper = $max_idx{4}{4} || 0; ### ### Find the maximum value in the whole table, to determine if we can @@ -607,7 +610,8 @@ sub print_radix_table for (my $j = 0; $j < $vals_per_line && $i <= $seg->{max_idx}; $j++) { - my $val = $seg->{values}->{$i}; + # missing values represent zero. + my $val = $seg->{values}->{$i} || 0; printf $out " 0x%0*x", $colwidth, $val; $off++;