2016-11-30 13:54:02 +01:00
|
|
|
#
|
2022-01-08 01:04:57 +01:00
|
|
|
# Copyright (c) 2001-2022, PostgreSQL Global Development Group
|
2016-11-30 13:54:02 +01:00
|
|
|
#
|
|
|
|
# src/backend/utils/mb/Unicode/convutils.pm
|
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
package convutils;
|
|
|
|
|
2016-11-30 13:54:02 +01:00
|
|
|
use strict;
|
2020-04-13 17:55:45 +02:00
|
|
|
use warnings;
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2018-02-24 20:35:54 +01:00
|
|
|
use Carp;
|
2017-03-13 19:46:39 +01:00
|
|
|
use Exporter 'import';
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
our @EXPORT =
|
|
|
|
qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);
|
|
|
|
|
|
|
|
# Constants used in the 'direction' field of the character maps
|
|
|
|
use constant {
|
|
|
|
NONE => 0,
|
|
|
|
TO_UNICODE => 1,
|
|
|
|
FROM_UNICODE => 2,
|
2018-05-09 16:14:46 +02:00
|
|
|
BOTH => 3
|
|
|
|
};
|
2016-11-30 13:54:02 +01:00
|
|
|
|
|
|
|
#######################################################################
|
|
|
|
# read_source - common routine to read source file
|
|
|
|
#
|
|
|
|
# fname ; input file name
|
2017-03-13 19:46:39 +01:00
|
|
|
#
|
2016-11-30 13:54:02 +01:00
|
|
|
sub read_source
|
|
|
|
{
|
|
|
|
my ($fname) = @_;
|
|
|
|
my @r;
|
|
|
|
|
|
|
|
open(my $in, '<', $fname) || die("cannot open $fname");
|
|
|
|
|
|
|
|
while (<$in>)
|
|
|
|
{
|
|
|
|
next if (/^#/);
|
|
|
|
chop;
|
|
|
|
|
|
|
|
next if (/^$/); # Ignore empty lines
|
|
|
|
|
|
|
|
next if (/^0x([0-9A-F]+)\s+(#.*)$/);
|
|
|
|
|
2017-03-13 18:04:45 +01:00
|
|
|
# The Unicode source files have three columns
|
|
|
|
# 1: The "foreign" code (in hex)
|
|
|
|
# 2: Unicode code point (in hex)
|
|
|
|
# 3: Unicode name
|
|
|
|
if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(#.*)$/)
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
|
|
|
print STDERR "READ ERROR at line $. in $fname: $_\n";
|
|
|
|
exit;
|
|
|
|
}
|
2017-04-07 18:19:50 +02:00
|
|
|
my $out = {
|
|
|
|
code => hex($1),
|
2016-11-30 13:54:02 +01:00
|
|
|
ucs => hex($2),
|
|
|
|
comment => $4,
|
2017-03-13 19:46:39 +01:00
|
|
|
direction => BOTH,
|
|
|
|
f => $fname,
|
2018-05-09 16:14:46 +02:00
|
|
|
l => $.
|
|
|
|
};
|
2016-11-30 13:54:02 +01:00
|
|
|
|
|
|
|
# Ignore pure ASCII mappings. PostgreSQL character conversion code
|
|
|
|
# never even passes these to the conversion code.
|
|
|
|
next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);
|
|
|
|
|
|
|
|
push(@r, $out);
|
|
|
|
}
|
|
|
|
close($in);
|
|
|
|
|
|
|
|
return \@r;
|
|
|
|
}
|
|
|
|
|
|
|
|
##################################################################
|
2017-03-13 19:46:39 +01:00
|
|
|
# print_conversion_tables - output mapping tables
|
2016-11-30 13:54:02 +01:00
|
|
|
#
|
2017-03-13 19:46:39 +01:00
|
|
|
# print_conversion_tables($this_script, $csname, \%charset)
|
2016-11-30 13:54:02 +01:00
|
|
|
#
|
2017-03-13 19:46:39 +01:00
|
|
|
# this_script - the name of the *caller script* of this feature
|
|
|
|
# csname - character set name other than ucs
|
|
|
|
# charset - ref to character set array
|
2016-11-30 13:54:02 +01:00
|
|
|
#
|
2017-03-13 19:46:39 +01:00
|
|
|
# Input character set array format:
|
2016-11-30 13:54:02 +01:00
|
|
|
#
|
2017-03-13 19:46:39 +01:00
|
|
|
# Each element in the character set array is a hash. Each hash has the following fields:
|
|
|
|
# direction - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether)
|
2016-11-30 13:54:02 +01:00
|
|
|
# ucs - Unicode code point
|
|
|
|
# ucs_second - Second Unicode code point, if this is a "combined" character.
|
|
|
|
# code - Byte sequence in the "other" character set, as an integer
|
|
|
|
# comment - Text representation of the character
|
|
|
|
# f - Source filename
|
|
|
|
# l - Line number in source file
|
|
|
|
#
|
2017-03-13 19:46:39 +01:00
|
|
|
sub print_conversion_tables
|
|
|
|
{
|
|
|
|
my ($this_script, $csname, $charset) = @_;
|
|
|
|
|
|
|
|
print_conversion_tables_direction($this_script, $csname, FROM_UNICODE,
|
|
|
|
$charset);
|
|
|
|
print_conversion_tables_direction($this_script, $csname, TO_UNICODE,
|
|
|
|
$charset);
|
2018-05-27 15:08:42 +02:00
|
|
|
return;
|
2017-03-13 19:46:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#############################################################################
|
|
|
|
# INTERNAL ROUTINES
|
|
|
|
|
|
|
|
#######################################################################
|
|
|
|
# print_conversion_tables_direction - write the whole content of C source of radix tree
|
|
|
|
#
|
|
|
|
# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth)
|
|
|
|
#
|
|
|
|
# this_script - the name of the *caller script* of this feature
|
|
|
|
# csname - character set name other than ucs
|
|
|
|
# direction - desired direction, TO_UNICODE or FROM_UNICODE
|
|
|
|
# charset - ref to character set array
|
2016-11-30 13:54:02 +01:00
|
|
|
#
|
2017-03-13 19:46:39 +01:00
|
|
|
sub print_conversion_tables_direction
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
my ($this_script, $csname, $direction, $charset) = @_;
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
my $fname;
|
|
|
|
my $tblname;
|
|
|
|
if ($direction == TO_UNICODE)
|
|
|
|
{
|
|
|
|
$fname = lc("${csname}_to_utf8.map");
|
|
|
|
$tblname = lc("${csname}_to_unicode_tree");
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
print "- Writing ${csname}=>UTF8 conversion table: $fname\n";
|
|
|
|
}
|
|
|
|
else
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
$fname = lc("utf8_to_${csname}.map");
|
|
|
|
$tblname = lc("${csname}_from_unicode_tree");
|
|
|
|
|
|
|
|
print "- Writing UTF8=>${csname} conversion table: $fname\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
open(my $out, '>', $fname) || die("cannot open $fname");
|
|
|
|
|
|
|
|
print $out "/* src/backend/utils/mb/Unicode/$fname */\n";
|
|
|
|
print $out "/* This file is generated by $this_script */\n\n";
|
|
|
|
|
2018-04-26 20:13:46 +02:00
|
|
|
# Collect regular, non-combined, mappings, and create the radix tree from them.
|
2017-03-13 19:46:39 +01:00
|
|
|
my $charmap = &make_charmap($out, $charset, $direction, 0);
|
|
|
|
print_radix_table($out, $tblname, $charmap);
|
|
|
|
|
2018-04-26 20:13:46 +02:00
|
|
|
# Collect combined characters, and create combined character table (if any)
|
2017-03-13 19:46:39 +01:00
|
|
|
my $charmap_combined = &make_charmap_combined($charset, $direction);
|
|
|
|
|
|
|
|
if (scalar @{$charmap_combined} > 0)
|
|
|
|
{
|
|
|
|
if ($direction == TO_UNICODE)
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
print_to_utf8_combined_map($out, $csname, $charmap_combined, 1);
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
print_from_utf8_combined_map($out, $csname, $charmap_combined, 1);
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
close($out);
|
2018-05-27 15:08:42 +02:00
|
|
|
return;
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
sub print_from_utf8_combined_map
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
my ($out, $charset, $table, $verbose) = @_;
|
2016-11-30 13:54:02 +01:00
|
|
|
|
|
|
|
my $last_comment = "";
|
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
printf $out "\n/* Combined character map */\n";
|
|
|
|
printf $out
|
2021-07-19 09:46:50 +02:00
|
|
|
"static const pg_utf_to_local_combined ULmap${charset}_combined[%d] = {",
|
2017-03-13 19:46:39 +01:00
|
|
|
scalar(@$table);
|
2016-11-30 13:54:02 +01:00
|
|
|
my $first = 1;
|
2017-02-01 10:23:53 +01:00
|
|
|
foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table)
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
|
|
|
print($out ",") if (!$first);
|
|
|
|
$first = 0;
|
2017-03-13 19:46:39 +01:00
|
|
|
print $out "\t/* $last_comment */"
|
|
|
|
if ($verbose && $last_comment ne "");
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
printf $out "\n {0x%08x, 0x%08x, 0x%04x}",
|
|
|
|
$i->{utf8}, $i->{utf8_second}, $i->{code};
|
2016-11-30 13:54:02 +01:00
|
|
|
if ($verbose >= 2)
|
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
$last_comment =
|
|
|
|
sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
2017-03-13 19:46:39 +01:00
|
|
|
elsif ($verbose >= 1)
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
2017-02-01 10:23:53 +01:00
|
|
|
$last_comment = $i->{comment};
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
|
|
|
}
|
2017-03-13 19:46:39 +01:00
|
|
|
print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
|
2016-11-30 13:54:02 +01:00
|
|
|
print $out "\n};\n";
|
2018-05-27 15:08:42 +02:00
|
|
|
return;
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
sub print_to_utf8_combined_map
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
my ($out, $charset, $table, $verbose) = @_;
|
2016-11-30 13:54:02 +01:00
|
|
|
|
|
|
|
my $last_comment = "";
|
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
printf $out "\n/* Combined character map */\n";
|
|
|
|
printf $out
|
2021-07-19 09:46:50 +02:00
|
|
|
"static const pg_local_to_utf_combined LUmap${charset}_combined[%d] = {",
|
2017-03-13 19:46:39 +01:00
|
|
|
scalar(@$table);
|
|
|
|
|
2016-11-30 13:54:02 +01:00
|
|
|
my $first = 1;
|
2017-03-13 19:46:39 +01:00
|
|
|
foreach my $i (sort { $a->{code} <=> $b->{code} } @$table)
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
|
|
|
print($out ",") if (!$first);
|
|
|
|
$first = 0;
|
2017-03-13 19:46:39 +01:00
|
|
|
print $out "\t/* $last_comment */"
|
|
|
|
if ($verbose && $last_comment ne "");
|
|
|
|
|
|
|
|
printf $out "\n {0x%04x, 0x%08x, 0x%08x}",
|
|
|
|
$i->{code}, $i->{utf8}, $i->{utf8_second};
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
if ($verbose >= 2)
|
|
|
|
{
|
|
|
|
$last_comment =
|
|
|
|
sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
|
|
|
|
}
|
|
|
|
elsif ($verbose >= 1)
|
|
|
|
{
|
|
|
|
$last_comment = $i->{comment};
|
|
|
|
}
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
2017-03-13 19:46:39 +01:00
|
|
|
print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
|
2016-11-30 13:54:02 +01:00
|
|
|
print $out "\n};\n";
|
2018-05-27 15:08:42 +02:00
|
|
|
return;
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
#######################################################################
|
|
|
|
# print_radix_table(<output handle>, <table name>, <charmap hash ref>)
|
|
|
|
#
|
|
|
|
# Input: A hash, mapping an input character to an output character.
|
|
|
|
#
|
|
|
|
# Constructs a radix tree from the hash, and prints it out as a C-struct.
|
|
|
|
#
|
|
|
|
sub print_radix_table
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
my ($out, $tblname, $c) = @_;
|
|
|
|
|
|
|
|
###
|
|
|
|
### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each
|
|
|
|
### radix tree is represented as a nested hash, each hash indexed by
|
|
|
|
### input byte
|
|
|
|
###
|
|
|
|
my %b1map;
|
|
|
|
my %b2map;
|
|
|
|
my %b3map;
|
|
|
|
my %b4map;
|
|
|
|
foreach my $in (keys %$c)
|
|
|
|
{
|
|
|
|
my $out = $c->{$in};
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2018-05-27 16:41:19 +02:00
|
|
|
if ($in <= 0xff)
|
2017-03-13 19:46:39 +01:00
|
|
|
{
|
|
|
|
$b1map{$in} = $out;
|
|
|
|
}
|
2018-05-27 16:41:19 +02:00
|
|
|
elsif ($in <= 0xffff)
|
2017-03-13 19:46:39 +01:00
|
|
|
{
|
|
|
|
my $b1 = $in >> 8;
|
|
|
|
my $b2 = $in & 0xff;
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
$b2map{$b1}{$b2} = $out;
|
|
|
|
}
|
2018-05-27 16:41:19 +02:00
|
|
|
elsif ($in <= 0xffffff)
|
2017-03-13 19:46:39 +01:00
|
|
|
{
|
|
|
|
my $b1 = $in >> 16;
|
|
|
|
my $b2 = ($in >> 8) & 0xff;
|
|
|
|
my $b3 = $in & 0xff;
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
$b3map{$b1}{$b2}{$b3} = $out;
|
|
|
|
}
|
2018-05-27 16:41:19 +02:00
|
|
|
elsif ($in <= 0xffffffff)
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
my $b1 = $in >> 24;
|
|
|
|
my $b2 = ($in >> 16) & 0xff;
|
|
|
|
my $b3 = ($in >> 8) & 0xff;
|
|
|
|
my $b4 = $in & 0xff;
|
|
|
|
|
|
|
|
$b4map{$b1}{$b2}{$b3}{$b4} = $out;
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
die sprintf("up to 4 byte code is supported: %x", $in);
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
|
|
|
}
|
2017-03-13 19:46:39 +01:00
|
|
|
|
|
|
|
my @segments;
|
|
|
|
|
|
|
|
###
|
|
|
|
### Build a linear list of "segments", from the nested hashes.
|
|
|
|
###
|
|
|
|
### Each segment is a lookup table, keyed by the next byte in the input.
|
|
|
|
### The segments are written out physically to one big array in the final
|
|
|
|
### step, but logically, they form a radix tree. Or rather, four radix
|
|
|
|
### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte
|
|
|
|
### inputs, and 4-byte inputs.
|
|
|
|
###
|
|
|
|
### Each segment is represented by a hash with following fields:
|
|
|
|
###
|
|
|
|
### comment => <string to output as a comment>
|
|
|
|
### label => <label that can be used to refer to this segment from elsewhere>
|
|
|
|
### values => <a hash, keyed by byte, 0-0xff>
|
|
|
|
###
|
|
|
|
### Entries in 'values' can be integers (for leaf-level segments), or
|
|
|
|
### string labels, pointing to a segment with that label. Any missing
|
|
|
|
### values are treated as zeros. If 'values' hash is missing altogether,
|
|
|
|
### it's treated as all-zeros.
|
|
|
|
###
|
|
|
|
### Subsequent steps will enrich the segments with more fields.
|
|
|
|
###
|
|
|
|
|
|
|
|
# Add the segments for the radix trees themselves.
|
|
|
|
push @segments,
|
|
|
|
build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
|
|
|
|
push @segments,
|
|
|
|
build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
|
|
|
|
push @segments,
|
|
|
|
build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
|
|
|
|
push @segments,
|
|
|
|
build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);
|
|
|
|
|
|
|
|
###
|
|
|
|
### Find min and max index used in each level of each tree.
|
|
|
|
###
|
|
|
|
### These are stored separately, and we can then leave out the unused
|
|
|
|
### parts of every segment. (When using the resulting tree, you must
|
|
|
|
### check each input byte against the min and max.)
|
|
|
|
###
|
|
|
|
my %min_idx;
|
|
|
|
my %max_idx;
|
|
|
|
foreach my $seg (@segments)
|
|
|
|
{
|
|
|
|
my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} };
|
|
|
|
my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} };
|
|
|
|
|
|
|
|
foreach my $i (keys %{ $seg->{values} })
|
|
|
|
{
|
|
|
|
$this_min = $i if (!defined $this_min || $i < $this_min);
|
|
|
|
$this_max = $i if (!defined $this_max || $i > $this_max);
|
|
|
|
}
|
|
|
|
|
|
|
|
$min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min;
|
|
|
|
$max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max;
|
|
|
|
}
|
2017-05-18 01:01:23 +02:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
# Copy the mins and max's back to every segment, for convenience.
|
|
|
|
foreach my $seg (@segments)
|
|
|
|
{
|
|
|
|
$seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} };
|
|
|
|
$seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} };
|
|
|
|
}
|
|
|
|
|
|
|
|
###
|
|
|
|
### Prepend a dummy all-zeros map to the beginning.
|
|
|
|
###
|
|
|
|
### A 0 is an invalid value anywhere in the table, and this allows us to
|
|
|
|
### point to 0 offset from any table, to get a 0 result.
|
|
|
|
###
|
|
|
|
|
|
|
|
# Find the max range between min and max indexes in any of the segments.
|
|
|
|
my $widest_range = 0;
|
|
|
|
foreach my $seg (@segments)
|
|
|
|
{
|
|
|
|
my $this_range = $seg->{max_idx} - $seg->{min_idx};
|
|
|
|
$widest_range = $this_range if ($this_range > $widest_range);
|
|
|
|
}
|
|
|
|
|
|
|
|
unshift @segments,
|
2018-05-09 16:14:46 +02:00
|
|
|
{
|
|
|
|
header => "Dummy map, for invalid values",
|
2017-03-13 19:46:39 +01:00
|
|
|
min_idx => 0,
|
2020-07-22 06:38:20 +02:00
|
|
|
max_idx => $widest_range,
|
|
|
|
label => "dummy map"
|
2018-05-09 16:14:46 +02:00
|
|
|
};
|
2017-03-13 19:46:39 +01:00
|
|
|
|
|
|
|
###
|
|
|
|
### Eliminate overlapping zeros
|
|
|
|
###
|
|
|
|
### For each segment, if there are zero values at the end of, and there
|
|
|
|
### are also zero values at the beginning of the next segment, we can
|
|
|
|
### overlay the tail of this segment with the head of next segment, to
|
|
|
|
### save space.
|
|
|
|
###
|
|
|
|
### To achieve that, we subtract the 'max_idx' of each segment with the
|
|
|
|
### amount of zeros that can be overlaid.
|
|
|
|
###
|
|
|
|
for (my $j = 0; $j < $#segments - 1; $j++)
|
|
|
|
{
|
|
|
|
my $seg = $segments[$j];
|
|
|
|
my $nextseg = $segments[ $j + 1 ];
|
|
|
|
|
|
|
|
# Count the number of zero values at the end of this segment.
|
|
|
|
my $this_trail_zeros = 0;
|
|
|
|
for (
|
|
|
|
my $i = $seg->{max_idx};
|
|
|
|
$i >= $seg->{min_idx} && !$seg->{values}->{$i};
|
|
|
|
$i--)
|
|
|
|
{
|
|
|
|
$this_trail_zeros++;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Count the number of zeros at the beginning of next segment.
|
|
|
|
my $next_lead_zeros = 0;
|
|
|
|
for (
|
|
|
|
my $i = $nextseg->{min_idx};
|
|
|
|
$i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i};
|
|
|
|
$i++)
|
|
|
|
{
|
|
|
|
$next_lead_zeros++;
|
|
|
|
}
|
|
|
|
|
|
|
|
# How many zeros in common?
|
|
|
|
my $overlaid_trail_zeros =
|
|
|
|
($this_trail_zeros > $next_lead_zeros)
|
|
|
|
? $next_lead_zeros
|
|
|
|
: $this_trail_zeros;
|
|
|
|
|
|
|
|
$seg->{overlaid_trail_zeros} = $overlaid_trail_zeros;
|
|
|
|
$seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros;
|
|
|
|
}
|
|
|
|
|
|
|
|
###
|
|
|
|
### Replace label references with real offsets.
|
|
|
|
###
|
|
|
|
### So far, the non-leaf segments have referred to other segments by
|
|
|
|
### their labels. Replace them with numerical offsets from the beginning
|
|
|
|
### of the final array. You cannot move, add, or remove segments after
|
|
|
|
### this step, as that would invalidate the offsets calculated here!
|
|
|
|
###
|
|
|
|
my $flatoff = 0;
|
|
|
|
my %segmap;
|
|
|
|
|
|
|
|
# First pass: assign offsets to each segment, and build hash
|
|
|
|
# of label => offset.
|
|
|
|
foreach my $seg (@segments)
|
|
|
|
{
|
|
|
|
$seg->{offset} = $flatoff;
|
|
|
|
$segmap{ $seg->{label} } = $flatoff;
|
|
|
|
$flatoff += $seg->{max_idx} - $seg->{min_idx} + 1;
|
|
|
|
}
|
|
|
|
my $tblsize = $flatoff;
|
|
|
|
|
|
|
|
# Second pass: look up the offset of each label reference in the hash.
|
|
|
|
foreach my $seg (@segments)
|
|
|
|
{
|
|
|
|
while (my ($i, $val) = each %{ $seg->{values} })
|
|
|
|
{
|
|
|
|
if (!($val =~ /^[0-9,.E]+$/))
|
|
|
|
{
|
|
|
|
my $segoff = $segmap{$val};
|
|
|
|
if ($segoff)
|
|
|
|
{
|
|
|
|
$seg->{values}->{$i} = $segoff;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
die "no segment with label $val";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Also look up the positions of the roots in the table.
|
2020-07-22 06:38:20 +02:00
|
|
|
# Missing map represents dummy mapping.
|
|
|
|
my $b1root = $segmap{"1-byte"} || 0;
|
|
|
|
my $b2root = $segmap{"2-byte"} || 0;
|
|
|
|
my $b3root = $segmap{"3-byte"} || 0;
|
|
|
|
my $b4root = $segmap{"4-byte"} || 0;
|
2017-03-13 19:46:39 +01:00
|
|
|
|
|
|
|
# And the lower-upper values of each level in each radix tree.
|
2020-07-22 06:38:20 +02:00
|
|
|
# Missing values represent zero.
|
|
|
|
my $b1_lower = $min_idx{1}{1} || 0;
|
|
|
|
my $b1_upper = $max_idx{1}{1} || 0;
|
|
|
|
|
|
|
|
my $b2_1_lower = $min_idx{2}{1} || 0;
|
|
|
|
my $b2_1_upper = $max_idx{2}{1} || 0;
|
|
|
|
my $b2_2_lower = $min_idx{2}{2} || 0;
|
|
|
|
my $b2_2_upper = $max_idx{2}{2} || 0;
|
|
|
|
|
|
|
|
my $b3_1_lower = $min_idx{3}{1} || 0;
|
|
|
|
my $b3_1_upper = $max_idx{3}{1} || 0;
|
|
|
|
my $b3_2_lower = $min_idx{3}{2} || 0;
|
|
|
|
my $b3_2_upper = $max_idx{3}{2} || 0;
|
|
|
|
my $b3_3_lower = $min_idx{3}{3} || 0;
|
|
|
|
my $b3_3_upper = $max_idx{3}{3} || 0;
|
|
|
|
|
|
|
|
my $b4_1_lower = $min_idx{4}{1} || 0;
|
|
|
|
my $b4_1_upper = $max_idx{4}{1} || 0;
|
|
|
|
my $b4_2_lower = $min_idx{4}{2} || 0;
|
|
|
|
my $b4_2_upper = $max_idx{4}{2} || 0;
|
|
|
|
my $b4_3_lower = $min_idx{4}{3} || 0;
|
|
|
|
my $b4_3_upper = $max_idx{4}{3} || 0;
|
|
|
|
my $b4_4_lower = $min_idx{4}{4} || 0;
|
|
|
|
my $b4_4_upper = $max_idx{4}{4} || 0;
|
2017-03-13 19:46:39 +01:00
|
|
|
|
|
|
|
###
|
|
|
|
### Find the maximum value in the whole table, to determine if we can
|
|
|
|
### use uint16 or if we need to use uint32.
|
|
|
|
###
|
|
|
|
my $max_val = 0;
|
|
|
|
foreach my $seg (@segments)
|
|
|
|
{
|
|
|
|
foreach my $val (values %{ $seg->{values} })
|
|
|
|
{
|
|
|
|
$max_val = $val if ($val > $max_val);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32";
|
|
|
|
|
|
|
|
# For formatting, determine how many values we can fit on a single
|
|
|
|
# line, and how wide each value needs to be to align nicely.
|
|
|
|
my $vals_per_line;
|
|
|
|
my $colwidth;
|
|
|
|
|
|
|
|
if ($max_val <= 0xffff)
|
|
|
|
{
|
|
|
|
$vals_per_line = 8;
|
|
|
|
$colwidth = 4;
|
|
|
|
}
|
|
|
|
elsif ($max_val <= 0xffffff)
|
|
|
|
{
|
|
|
|
$vals_per_line = 4;
|
|
|
|
$colwidth = 6;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
$vals_per_line = 4;
|
|
|
|
$colwidth = 8;
|
|
|
|
}
|
|
|
|
|
|
|
|
###
|
|
|
|
### Print the struct and array.
|
|
|
|
###
|
2017-03-13 20:53:38 +01:00
|
|
|
printf $out "static const $datatype ${tblname}_table[$tblsize];\n";
|
2017-03-13 19:46:39 +01:00
|
|
|
printf $out "\n";
|
|
|
|
printf $out "static const pg_mb_radix_tree $tblname =\n";
|
|
|
|
printf $out "{\n";
|
|
|
|
if ($datatype eq "uint16")
|
|
|
|
{
|
|
|
|
print $out " ${tblname}_table,\n";
|
|
|
|
print $out " NULL, /* 32-bit table not used */\n";
|
|
|
|
}
|
|
|
|
if ($datatype eq "uint32")
|
|
|
|
{
|
|
|
|
print $out " NULL, /* 16-bit table not used */\n";
|
|
|
|
print $out " ${tblname}_table,\n";
|
|
|
|
}
|
|
|
|
printf $out "\n";
|
|
|
|
printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n",
|
|
|
|
$b1root;
|
|
|
|
printf $out " 0x%02x, /* b1_lower */\n", $b1_lower;
|
|
|
|
printf $out " 0x%02x, /* b1_upper */\n", $b1_upper;
|
|
|
|
printf $out "\n";
|
|
|
|
printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n",
|
|
|
|
$b2root;
|
|
|
|
printf $out " 0x%02x, /* b2_1_lower */\n", $b2_1_lower;
|
|
|
|
printf $out " 0x%02x, /* b2_1_upper */\n", $b2_1_upper;
|
|
|
|
printf $out " 0x%02x, /* b2_2_lower */\n", $b2_2_lower;
|
|
|
|
printf $out " 0x%02x, /* b2_2_upper */\n", $b2_2_upper;
|
|
|
|
printf $out "\n";
|
|
|
|
printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n",
|
|
|
|
$b3root;
|
|
|
|
printf $out " 0x%02x, /* b3_1_lower */\n", $b3_1_lower;
|
|
|
|
printf $out " 0x%02x, /* b3_1_upper */\n", $b3_1_upper;
|
|
|
|
printf $out " 0x%02x, /* b3_2_lower */\n", $b3_2_lower;
|
|
|
|
printf $out " 0x%02x, /* b3_2_upper */\n", $b3_2_upper;
|
|
|
|
printf $out " 0x%02x, /* b3_3_lower */\n", $b3_3_lower;
|
|
|
|
printf $out " 0x%02x, /* b3_3_upper */\n", $b3_3_upper;
|
|
|
|
printf $out "\n";
|
2021-07-22 09:37:35 +02:00
|
|
|
printf $out " 0x%04x, /* offset of table for 4-byte inputs */\n",
|
2017-03-13 19:46:39 +01:00
|
|
|
$b4root;
|
|
|
|
printf $out " 0x%02x, /* b4_1_lower */\n", $b4_1_lower;
|
|
|
|
printf $out " 0x%02x, /* b4_1_upper */\n", $b4_1_upper;
|
|
|
|
printf $out " 0x%02x, /* b4_2_lower */\n", $b4_2_lower;
|
|
|
|
printf $out " 0x%02x, /* b4_2_upper */\n", $b4_2_upper;
|
|
|
|
printf $out " 0x%02x, /* b4_3_lower */\n", $b4_3_lower;
|
|
|
|
printf $out " 0x%02x, /* b4_3_upper */\n", $b4_3_upper;
|
|
|
|
printf $out " 0x%02x, /* b4_4_lower */\n", $b4_4_lower;
|
|
|
|
printf $out " 0x%02x /* b4_4_upper */\n", $b4_4_upper;
|
|
|
|
print $out "};\n";
|
|
|
|
print $out "\n";
|
|
|
|
print $out "static const $datatype ${tblname}_table[$tblsize] =\n";
|
|
|
|
print $out "{";
|
|
|
|
my $off = 0;
|
2017-05-18 01:01:23 +02:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
foreach my $seg (@segments)
|
|
|
|
{
|
|
|
|
printf $out "\n";
|
|
|
|
printf $out " /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off;
|
|
|
|
printf $out "\n";
|
|
|
|
|
|
|
|
for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};)
|
|
|
|
{
|
2017-05-18 01:01:23 +02:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
# Print the next line's worth of values.
|
|
|
|
# XXX pad to begin at a nice boundary
|
|
|
|
printf $out " /* %02x */ ", $i;
|
|
|
|
for (my $j = 0;
|
|
|
|
$j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
|
|
|
|
{
|
2020-07-22 06:38:20 +02:00
|
|
|
# missing values represent zero.
|
|
|
|
my $val = $seg->{values}->{$i} || 0;
|
2017-03-13 19:46:39 +01:00
|
|
|
|
|
|
|
printf $out " 0x%0*x", $colwidth, $val;
|
|
|
|
$off++;
|
|
|
|
if ($off != $tblsize)
|
|
|
|
{
|
|
|
|
print $out ",";
|
|
|
|
}
|
|
|
|
$i++;
|
|
|
|
}
|
|
|
|
print $out "\n";
|
|
|
|
}
|
|
|
|
if ($seg->{overlaid_trail_zeros})
|
|
|
|
{
|
|
|
|
printf $out
|
2018-04-26 17:52:52 +02:00
|
|
|
" /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
|
2017-03-13 19:46:39 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Sanity check.
|
|
|
|
if ($off != $tblsize) { die "table size didn't match!"; }
|
|
|
|
|
|
|
|
print $out "};\n";
|
2018-05-27 15:08:42 +02:00
|
|
|
return;
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
###
|
|
|
|
sub build_segments_from_tree
|
2016-11-30 13:54:02 +01:00
|
|
|
{
|
2017-03-13 19:46:39 +01:00
|
|
|
my ($header, $rootlabel, $depth, $map) = @_;
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
my @segments;
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
if (%{$map})
|
|
|
|
{
|
|
|
|
@segments =
|
|
|
|
build_segments_recurse($header, $rootlabel, "", 1, $depth, $map);
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
# Sort the segments into "breadth-first" order. Not strictly required,
|
|
|
|
# but makes the maps nicer to read.
|
|
|
|
@segments =
|
|
|
|
sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} }
|
|
|
|
@segments;
|
|
|
|
}
|
|
|
|
|
|
|
|
return @segments;
|
|
|
|
}
|
2016-11-30 13:54:02 +01:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
###
|
|
|
|
sub build_segments_recurse
|
|
|
|
{
|
|
|
|
my ($header, $label, $path, $level, $depth, $map) = @_;
|
|
|
|
|
|
|
|
my @segments;
|
|
|
|
|
|
|
|
if ($level == $depth)
|
|
|
|
{
|
|
|
|
push @segments,
|
2018-05-09 16:14:46 +02:00
|
|
|
{
|
|
|
|
header => $header . ", leaf: ${path}xx",
|
2017-03-13 19:46:39 +01:00
|
|
|
label => $label,
|
|
|
|
level => $level,
|
|
|
|
depth => $depth,
|
|
|
|
path => $path,
|
2018-05-09 16:14:46 +02:00
|
|
|
values => $map
|
|
|
|
};
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
2017-03-13 19:46:39 +01:00
|
|
|
else
|
|
|
|
{
|
|
|
|
my %children;
|
|
|
|
|
|
|
|
while (my ($i, $val) = each %$map)
|
|
|
|
{
|
|
|
|
my $childpath = $path . sprintf("%02x", $i);
|
|
|
|
my $childlabel = "$depth-level-$level-$childpath";
|
|
|
|
|
|
|
|
push @segments,
|
|
|
|
build_segments_recurse($header, $childlabel, $childpath,
|
|
|
|
$level + 1, $depth, $val);
|
|
|
|
$children{$i} = $childlabel;
|
|
|
|
}
|
|
|
|
|
|
|
|
push @segments,
|
2018-05-09 16:14:46 +02:00
|
|
|
{
|
|
|
|
header => $header . ", byte #$level: ${path}xx",
|
2017-03-13 19:46:39 +01:00
|
|
|
label => $label,
|
|
|
|
level => $level,
|
|
|
|
depth => $depth,
|
|
|
|
path => $path,
|
2018-05-09 16:14:46 +02:00
|
|
|
values => \%children
|
|
|
|
};
|
2017-03-13 19:46:39 +01:00
|
|
|
}
|
|
|
|
return @segments;
|
|
|
|
}
|
|
|
|
|
|
|
|
#######################################################################
|
|
|
|
# make_charmap - convert charset table to charmap hash
|
|
|
|
#
|
|
|
|
# make_charmap(\@charset, $direction)
|
|
|
|
# charset - ref to charset table : see print_conversion_tables
|
|
|
|
# direction - conversion direction
|
|
|
|
#
|
|
|
|
sub make_charmap
|
|
|
|
{
|
|
|
|
my ($out, $charset, $direction, $verbose) = @_;
|
|
|
|
|
2018-02-24 20:35:54 +01:00
|
|
|
croak "unacceptable direction : $direction"
|
2017-03-13 19:46:39 +01:00
|
|
|
if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
|
|
|
|
|
|
|
|
# In verbose mode, print a large comment with the source and comment of
|
|
|
|
# each character
|
|
|
|
if ($verbose)
|
|
|
|
{
|
|
|
|
print $out "/*\n";
|
|
|
|
print $out "<src> <dst> <file>:<lineno> <comment>\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
my %charmap;
|
|
|
|
foreach my $c (@$charset)
|
|
|
|
{
|
2017-05-18 01:01:23 +02:00
|
|
|
|
2017-03-13 19:46:39 +01:00
|
|
|
# combined characters are handled elsewhere
|
|
|
|
next if (defined $c->{ucs_second});
|
|
|
|
|
|
|
|
next if ($c->{direction} != $direction && $c->{direction} != BOTH);
|
|
|
|
|
|
|
|
my ($src, $dst) =
|
|
|
|
$direction == TO_UNICODE
|
|
|
|
? ($c->{code}, ucs2utf($c->{ucs}))
|
|
|
|
: (ucs2utf($c->{ucs}), $c->{code});
|
|
|
|
|
|
|
|
# check for duplicate source codes
|
|
|
|
if (defined $charmap{$src})
|
|
|
|
{
|
|
|
|
printf STDERR
|
2018-04-26 17:52:52 +02:00
|
|
|
"Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n",
|
2017-03-13 19:46:39 +01:00
|
|
|
$c->{f}, $c->{l}, $src, $charmap{$src}, $dst;
|
|
|
|
exit;
|
|
|
|
}
|
|
|
|
$charmap{$src} = $dst;
|
|
|
|
|
|
|
|
if ($verbose)
|
|
|
|
{
|
|
|
|
printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f},
|
|
|
|
$c->{l}, $c->{comment};
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if ($verbose)
|
|
|
|
{
|
|
|
|
print $out "*/\n\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
return \%charmap;
|
|
|
|
}
|
|
|
|
|
|
|
|
#######################################################################
|
|
|
|
# make_charmap_combined - convert charset table to charmap hash
|
|
|
|
# with checking duplicate source code
|
|
|
|
#
|
|
|
|
# make_charmap_combined(\@charset, $direction)
|
|
|
|
# charset - ref to charset table : see print_conversion_tables
|
|
|
|
# direction - conversion direction
|
|
|
|
#
|
|
|
|
sub make_charmap_combined
|
|
|
|
{
|
|
|
|
my ($charset, $direction) = @_;
|
|
|
|
|
2018-02-24 20:35:54 +01:00
|
|
|
croak "unacceptable direction : $direction"
|
2017-03-13 19:46:39 +01:00
|
|
|
if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
|
|
|
|
|
|
|
|
my @combined;
|
|
|
|
foreach my $c (@$charset)
|
|
|
|
{
|
|
|
|
next if ($c->{direction} != $direction && $c->{direction} != BOTH);
|
|
|
|
|
|
|
|
if (defined $c->{ucs_second})
|
|
|
|
{
|
|
|
|
my $entry = {
|
|
|
|
utf8 => ucs2utf($c->{ucs}),
|
|
|
|
utf8_second => ucs2utf($c->{ucs_second}),
|
|
|
|
code => $c->{code},
|
|
|
|
comment => $c->{comment},
|
|
|
|
f => $c->{f},
|
2018-05-09 16:14:46 +02:00
|
|
|
l => $c->{l}
|
|
|
|
};
|
2017-03-13 19:46:39 +01:00
|
|
|
push @combined, $entry;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return \@combined;
|
|
|
|
}
|
|
|
|
|
|
|
|
#######################################################################
|
|
|
|
# convert UCS-4 to UTF-8
|
|
|
|
#
|
|
|
|
sub ucs2utf
|
|
|
|
{
|
|
|
|
my ($ucs) = @_;
|
|
|
|
my $utf;
|
|
|
|
|
|
|
|
if ($ucs <= 0x007f)
|
|
|
|
{
|
|
|
|
$utf = $ucs;
|
|
|
|
}
|
|
|
|
elsif ($ucs > 0x007f && $ucs <= 0x07ff)
|
|
|
|
{
|
|
|
|
$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
|
|
|
|
}
|
|
|
|
elsif ($ucs > 0x07ff && $ucs <= 0xffff)
|
|
|
|
{
|
|
|
|
$utf =
|
|
|
|
((($ucs >> 12) | 0xe0) << 16) |
|
|
|
|
(((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
$utf =
|
|
|
|
((($ucs >> 18) | 0xf0) << 24) |
|
|
|
|
(((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
|
|
|
|
(((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
|
|
|
|
}
|
2017-08-17 18:39:20 +02:00
|
|
|
return $utf;
|
2016-11-30 13:54:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
1;
|