#
# Copyright (c) 2001-2023, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/convutils.pm

package convutils;

use strict;
use warnings;

use Carp;
use Exporter 'import';

our @EXPORT =
  qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);

# Constants used in the 'direction' field of the character maps
use constant {
	NONE => 0,
	TO_UNICODE => 1,
	FROM_UNICODE => 2,
	BOTH => 3
};

#######################################################################
# read_source - common routine to read source file
#
# fname ; input file name
#
# Returns a reference to an array of hashes, one per accepted mapping line,
# each with these fields:
#   code      - the "foreign" code, as an integer
#   ucs       - the Unicode code point, as an integer
#   comment   - the third column of the line (the text starting at '#')
#   direction - always BOTH
#   f         - the input file name
#   l         - line number of the mapping in the input file
#
# Lines starting with '#', empty lines, lines that carry a code but no
# Unicode mapping, and mappings where either side is below 0x80 are skipped.
#
# Dies if the file cannot be opened.  On a line that cannot be parsed, prints
# a READ ERROR message to STDERR and exits with a non-zero status.
#
sub read_source
{
	my ($fname) = @_;
	my @r;

	open(my $in, '<', $fname) || die("cannot open $fname: $!");

	while (my $line = <$in>)
	{
		next if ($line =~ /^#/);

		# chomp, not chop: a final line without a trailing newline must not
		# lose its last character.
		chomp($line);

		next if ($line =~ /^$/);    # Ignore empty lines

		# Ignore codes that have no Unicode mapping, only a comment.  Accept
		# both hex cases, like the mapping pattern below does.
		next if ($line =~ /^0x[0-9A-Fa-f]+\s+#.*$/);

		# The Unicode source files have three columns
		# 1: The "foreign" code (in hex)
		# 2: Unicode code point (in hex)
		# 3: Unicode name
		my ($code_hex, $ucs_hex, $comment) =
		  $line =~ /^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(#.*)$/;
		if (!defined($code_hex))
		{
			print STDERR "READ ERROR at line $. in $fname: $line\n";

			# Report the failure to the caller; a bare exit would signal
			# success.
			exit 1;
		}

		my $out = {
			code => hex($code_hex),
			ucs => hex($ucs_hex),
			comment => $comment,    # third capture; there is no fourth
			direction => BOTH,
			f => $fname,
			l => $.
		};

		# Ignore pure ASCII mappings. PostgreSQL character conversion code
		# never even passes these to the conversion code.
		next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);

		push(@r, $out);
	}
	close($in);

	return \@r;
}

##################################################################
# print_conversion_tables - output mapping tables
#
# print_conversion_tables($this_script, $csname, \@charset)
#
# this_script - the name of the *caller script* of this feature
# csname      - character set name other than ucs
# charset     - ref to character set array
#
# Input character set array format:
#
# Each element in the character set array is a hash. Each hash has the following fields:
#   direction  - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether)
#   ucs        - Unicode code point
#   ucs_second - Second Unicode code point, if this is a "combined" character.
#   code       - Byte sequence in the "other" character set, as an integer
#   comment    - Text representation of the character
#   f          - Source filename
#   l          - Line number in source file
#
# Writes one map file per direction (FROM_UNICODE first, then TO_UNICODE)
# into the current directory.
#
sub print_conversion_tables
{
	my ($this_script, $csname, $charset) = @_;

	foreach my $direction (FROM_UNICODE, TO_UNICODE)
	{
		print_conversion_tables_direction($this_script, $csname, $direction,
			$charset);
	}
	return;
}

#############################################################################
# INTERNAL ROUTINES

#######################################################################
# print_conversion_tables_direction - write the whole content of C source of radix tree
#
# print_conversion_tables_direction($this_script, $csname, $direction, \@charset)
#
# this_script - the name of the *caller script* of this feature
# csname      - character set name other than ucs
# direction   - desired direction, TO_UNICODE or FROM_UNICODE
# charset     - ref to character set array
#
# The output file is named <csname>_to_utf8.map for TO_UNICODE and
# utf8_to_<csname>.map otherwise (lower-cased).  Dies if the file cannot be
# opened or if closing it reports an error.
#
sub print_conversion_tables_direction
{
	my ($this_script, $csname, $direction, $charset) = @_;

	my $fname;
	my $tblname;
	if ($direction == TO_UNICODE)
	{
		$fname = lc("${csname}_to_utf8.map");
		$tblname = lc("${csname}_to_unicode_tree");

		print "- Writing ${csname}=>UTF8 conversion table: $fname\n";
	}
	else
	{
		$fname = lc("utf8_to_${csname}.map");
		$tblname = lc("${csname}_from_unicode_tree");

		print "- Writing UTF8=>${csname} conversion table: $fname\n";
	}

	open(my $out, '>', $fname) || die("cannot open $fname: $!");

	print $out "/* src/backend/utils/mb/Unicode/$fname */\n";
	print $out "/* This file is generated by $this_script */\n\n";

	# Collect regular, non-combined, mappings, and create the radix tree from them.
	my $charmap = make_charmap($out, $charset, $direction, 0);
	print_radix_table($out, $tblname, $charmap);

	# Collect combined characters, and create combined character table (if any)
	my $charmap_combined = make_charmap_combined($charset, $direction);

	if (scalar @{$charmap_combined} > 0)
	{
		if ($direction == TO_UNICODE)
		{
			print_to_utf8_combined_map($out, $csname, $charmap_combined, 1);
		}
		else
		{
			print_from_utf8_combined_map($out, $csname, $charmap_combined, 1);
		}
	}

	# Buffered write errors only surface at close time, so check it.
	close($out) || die("cannot close $fname: $!");

	return;
}

# _print_combined_map - shared body of the two combined-map printers
#
#   out     - output file handle
#   ctype   - C element type of the emitted array
#   cname   - C identifier of the emitted array
#   table   - ref to array of entry hashes
#   verbose - 0: no comments; >= 1: append each entry's comment;
#             >= 2: prefix the comment with source file and line number
#   sortkey - entry field the rows are ordered by (numeric, ascending)
#   rowfmt  - printf format producing one initializer row
#   fields  - ref to list of entry fields passed to rowfmt, in order
#
sub _print_combined_map
{
	my ($out, $ctype, $cname, $table, $verbose, $sortkey, $rowfmt, $fields)
	  = @_;

	print $out "\n/* Combined character map */\n";

	# Pass the names as arguments rather than interpolating them into the
	# format string.
	printf $out "static const %s %s[%d] = {", $ctype, $cname,
	  scalar(@$table);

	my @rows = sort { $a->{$sortkey} <=> $b->{$sortkey} } @$table;
	foreach my $idx (0 .. $#rows)
	{
		my $entry = $rows[$idx];

		printf $out $rowfmt, @{$entry}{@$fields};

		# The separator goes before the trailing comment, so that the comment
		# ends the line of the row it describes.
		print $out "," if ($idx < $#rows);

		# A missing comment is treated as empty instead of warning.
		my $text = defined($entry->{comment}) ? $entry->{comment} : "";
		my $comment = "";
		if ($verbose >= 2)
		{
			$comment = sprintf("%s:%d %s", $entry->{f}, $entry->{l}, $text);
		}
		elsif ($verbose >= 1)
		{
			$comment = $text;
		}
		print $out "\t/* $comment */" if ($comment ne "");
	}

	print $out "\n};\n";

	return;
}

# print_from_utf8_combined_map - print the UTF8 => local combined character map
#
#   out     - output file handle
#   charset - character set name, used in the C array name
#   table   - ref to array of entries with utf8, utf8_second, code, comment,
#             f and l fields
#   verbose - comment verbosity, see _print_combined_map
#
# Rows are sorted by the 'utf8' field.
#
sub print_from_utf8_combined_map
{
	my ($out, $charset, $table, $verbose) = @_;

	_print_combined_map(
		$out, 'pg_utf_to_local_combined',
		"ULmap${charset}_combined", $table,
		$verbose, 'utf8',
		"\n {0x%08x, 0x%08x, 0x%04x}", [qw(utf8 utf8_second code)]);

	return;
}

# print_to_utf8_combined_map - print the local => UTF8 combined character map
#
#   out     - output file handle
#   charset - character set name, used in the C array name
#   table   - ref to array of entries with code, utf8, utf8_second, comment,
#             f and l fields
#   verbose - comment verbosity, see _print_combined_map
#
# Rows are sorted by the 'code' field.
#
sub print_to_utf8_combined_map
{
	my ($out, $charset, $table, $verbose) = @_;

	_print_combined_map(
		$out, 'pg_local_to_utf_combined',
		"LUmap${charset}_combined", $table,
		$verbose, 'code',
		"\n {0x%04x, 0x%08x, 0x%08x}", [qw(code utf8 utf8_second)]);

	return;
}

#######################################################################
# print_radix_table(, , )
#
# Input: A hash, mapping an input character to an output character.
#
# Constructs a radix tree from the hash, and prints it out as a C-struct.
#
sub print_radix_table
{
	my ($out, $tblname, $c) = @_;

	###
	### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each
	### radix tree is represented as a nested hash, each hash indexed by
	### input byte
	###
	my %b1map;
	my %b2map;
	my %b3map;
	my %b4map;
	foreach my $in (keys %$c)
	{
		my $out = $c->{$in};

		if ($in <= 0xff)
		{
			$b1map{$in} = $out;
		}
		elsif ($in <= 0xffff)
		{
			my $b1 = $in >> 8;
			my $b2 = $in & 0xff;

			$b2map{$b1}{$b2} = $out;
		}
		elsif ($in <= 0xffffff)
		{
			my $b1 = $in >> 16;
			my $b2 = ($in >> 8) & 0xff;
			my $b3 = $in & 0xff;

			$b3map{$b1}{$b2}{$b3} = $out;
		}
		elsif ($in <= 0xffffffff)
		{
			my $b1 = $in >> 24;
			my $b2 = ($in >> 16) & 0xff;
			my $b3 = ($in >> 8) & 0xff;
			my $b4 = $in & 0xff;

			$b4map{$b1}{$b2}{$b3}{$b4} = $out;
		}
		else
		{
			die sprintf("up to 4 byte code is supported: %x", $in);
		}
	}

	my @segments;

	###
	### Build a linear list of "segments", from the nested hashes.
	###
	### Each segment is a lookup table, keyed by the next byte in the input.
	### The segments are written out physically to one big array in the final
	### step, but logically, they form a radix tree. Or rather, four radix
	### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte
	### inputs, and 4-byte inputs.
	###
	### Each segment is represented by a hash with following fields:
	###
	### comment =>
	### label =>