Fix conversion table generator scripts.

convutils.pm relied on the implicit conversion of undefined values to
integer zero.  Some of the conversion scripts are susceptible to regexp
greediness.  Fix both problems while avoiding whitespace changes in the
output.  Also update the ICU URLs that moved.

No need to back-patch, because the output of these scripts is also in
the source tree so we shouldn't need to rerun them on back-branches.

Author: Kyotaro Horiguchi <horikyoga.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJ7SEGLbj%3D%3DTQCcyKRA9aqj8%2B6L%3DexSq1y25TA%3DWxLziQ%40mail.gmail.com
This commit is contained in:
Thomas Munro 2020-07-22 16:38:20 +12:00
parent e47c2602aa
commit a5073871ea
5 changed files with 41 additions and 34 deletions

View File

@ -122,7 +122,7 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt:
$(DOWNLOAD) http://x0213.org/codetable/$(@F)
gb-18030-2000.xml windows-949-2000.xml:
$(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F)
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
GB2312.TXT:
$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'

View File

@ -24,12 +24,13 @@ my @all;
while (my $line = <$in>)
{
if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
if ($line =~ /^0x(\w+)\s*U\+(\w+)\+(\w+)\s*#\s*(\S.*)?\s*$/)
{
# combined characters
my ($c, $u1, $u2) = ($1, $2, $3);
my $rest = "U+" . $u1 . "+" . $u2 . $4;
# The "\t \t" below is just to avoid insubstantial diffs.
my $rest = "U+" . $u1 . "+" . $u2 . "\t \t" . $4;
my $code = hex($c);
my $ucs1 = hex($u1);
my $ucs2 = hex($u2);
@ -45,7 +46,7 @@ while (my $line = <$in>)
l => $.
};
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
elsif ($line =~ /^0x(\w+)\s*U\+(\w+)\s*#\s*(\S.*)?\s*$/)
{
# non-combined characters

View File

@ -80,7 +80,8 @@ foreach my $i (@$ct932)
}
}
foreach my $i (@mapping)
# extract only SJIS characters
foreach my $i (grep defined $_->{sjis}, @mapping)
{
my $sjis = $i->{sjis};

View File

@ -24,12 +24,13 @@ my @mapping;
while (my $line = <$in>)
{
if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
if ($line =~ /^0x(\w+)\s*U\+(\w+)\+(\w+)\s*#\s*(\S.*)?\s*$/)
{
# combined characters
my ($c, $u1, $u2) = ($1, $2, $3);
my $rest = "U+" . $u1 . "+" . $u2 . $4;
# The "\t \t" below is just to avoid insubstantial diffs.
my $rest = "U+" . $u1 . "+" . $u2 . "\t \t" . $4;
my $code = hex($c);
my $ucs1 = hex($u1);
my $ucs2 = hex($u2);
@ -45,7 +46,7 @@ while (my $line = <$in>)
l => $.
};
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
elsif ($line =~ /^0x(\w+)\s*U\+(\w+)\s*#\s*(\S.*)?\s*$/)
{
# non-combined characters

View File

@ -380,7 +380,8 @@ sub print_radix_table
{
header => "Dummy map, for invalid values",
min_idx => 0,
max_idx => $widest_range
max_idx => $widest_range,
label => "dummy map"
};
###
@ -471,35 +472,37 @@ sub print_radix_table
}
# Also look up the positions of the roots in the table.
my $b1root = $segmap{"1-byte"};
my $b2root = $segmap{"2-byte"};
my $b3root = $segmap{"3-byte"};
my $b4root = $segmap{"4-byte"};
# Missing map represents dummy mapping.
my $b1root = $segmap{"1-byte"} || 0;
my $b2root = $segmap{"2-byte"} || 0;
my $b3root = $segmap{"3-byte"} || 0;
my $b4root = $segmap{"4-byte"} || 0;
# And the lower-upper values of each level in each radix tree.
my $b1_lower = $min_idx{1}{1};
my $b1_upper = $max_idx{1}{1};
# Missing values represent zero.
my $b1_lower = $min_idx{1}{1} || 0;
my $b1_upper = $max_idx{1}{1} || 0;
my $b2_1_lower = $min_idx{2}{1};
my $b2_1_upper = $max_idx{2}{1};
my $b2_2_lower = $min_idx{2}{2};
my $b2_2_upper = $max_idx{2}{2};
my $b2_1_lower = $min_idx{2}{1} || 0;
my $b2_1_upper = $max_idx{2}{1} || 0;
my $b2_2_lower = $min_idx{2}{2} || 0;
my $b2_2_upper = $max_idx{2}{2} || 0;
my $b3_1_lower = $min_idx{3}{1};
my $b3_1_upper = $max_idx{3}{1};
my $b3_2_lower = $min_idx{3}{2};
my $b3_2_upper = $max_idx{3}{2};
my $b3_3_lower = $min_idx{3}{3};
my $b3_3_upper = $max_idx{3}{3};
my $b3_1_lower = $min_idx{3}{1} || 0;
my $b3_1_upper = $max_idx{3}{1} || 0;
my $b3_2_lower = $min_idx{3}{2} || 0;
my $b3_2_upper = $max_idx{3}{2} || 0;
my $b3_3_lower = $min_idx{3}{3} || 0;
my $b3_3_upper = $max_idx{3}{3} || 0;
my $b4_1_lower = $min_idx{4}{1};
my $b4_1_upper = $max_idx{4}{1};
my $b4_2_lower = $min_idx{4}{2};
my $b4_2_upper = $max_idx{4}{2};
my $b4_3_lower = $min_idx{4}{3};
my $b4_3_upper = $max_idx{4}{3};
my $b4_4_lower = $min_idx{4}{4};
my $b4_4_upper = $max_idx{4}{4};
my $b4_1_lower = $min_idx{4}{1} || 0;
my $b4_1_upper = $max_idx{4}{1} || 0;
my $b4_2_lower = $min_idx{4}{2} || 0;
my $b4_2_upper = $max_idx{4}{2} || 0;
my $b4_3_lower = $min_idx{4}{3} || 0;
my $b4_3_upper = $max_idx{4}{3} || 0;
my $b4_4_lower = $min_idx{4}{4} || 0;
my $b4_4_upper = $max_idx{4}{4} || 0;
###
### Find the maximum value in the whole table, to determine if we can
@ -607,7 +610,8 @@ sub print_radix_table
for (my $j = 0;
$j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
{
my $val = $seg->{values}->{$i};
# missing values represent zero.
my $val = $seg->{values}->{$i} || 0;
printf $out " 0x%0*x", $colwidth, $val;
$off++;