107 lines
2.8 KiB
Perl
107 lines
2.8 KiB
Perl
#!/usr/bin/perl
|
|
#
|
|
# Read Unicode consortium's normalization test suite, NormalizationTest.txt,
|
|
# and generate a C array from it, for norm_test.c.
|
|
#
|
|
# NormalizationTest.txt is part of the Unicode Character Database.
|
|
#
|
|
# Copyright (c) 2000-2024, PostgreSQL Global Development Group
|
|
|
|
use strict;
|
|
use warnings FATAL => 'all';
|
|
|
|
use File::Basename;
|
|
|
|
die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2;
|
|
my $input_file = $ARGV[0];
|
|
my $output_file = $ARGV[1];
|
|
my $output_base = basename($output_file);
|
|
|
|
# Open the input and output files
|
|
open my $INPUT, '<', $input_file
|
|
or die "Could not open input file $input_file: $!";
|
|
open my $OUTPUT, '>', $output_file
|
|
or die "Could not open output file $output_file: $!\n";
|
|
|
|
# Print header of output file.
|
|
print $OUTPUT <<HEADER;
|
|
/*-------------------------------------------------------------------------
|
|
*
|
|
* norm_test_table.h
|
|
* Test strings for Unicode normalization.
|
|
*
|
|
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* src/common/unicode/norm_test_table.h
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
|
|
/*
|
|
* File auto-generated by src/common/unicode/generate-norm_test_table.pl, do
|
|
* not edit. There is deliberately not an #ifndef PG_NORM_TEST_TABLE_H
|
|
* here.
|
|
*/
|
|
|
|
typedef struct
|
|
{
|
|
int linenum;
|
|
pg_wchar input[50];
|
|
pg_wchar output[4][50];
|
|
} pg_unicode_test;
|
|
|
|
/* test table */
|
|
HEADER
|
|
print $OUTPUT
|
|
"static const pg_unicode_test UnicodeNormalizationTests[] =\n{\n";
|
|
|
|
# Helper routine to convert a space-separated list of Unicode characters to
|
|
# hexadecimal list format, suitable for outputting in a C array.
|
|
sub codepoint_string_to_hex
|
|
{
|
|
my $codepoint_string = shift;
|
|
|
|
my $result;
|
|
|
|
foreach (split(' ', $codepoint_string))
|
|
{
|
|
my $cp = $_;
|
|
my $utf8 = "0x$cp, ";
|
|
$result .= $utf8;
|
|
}
|
|
$result .= '0'; # null-terminated the array
|
|
return $result;
|
|
}
|
|
|
|
# Process the input file line by line
|
|
my $linenum = 0;
|
|
while (my $line = <$INPUT>)
|
|
{
|
|
$linenum = $linenum + 1;
|
|
if ($line =~ /^\s*#/) { next; } # ignore comments
|
|
|
|
if ($line =~ /^@/) { next; } # ignore @Part0 like headers
|
|
|
|
# Split the line wanted and get the fields needed:
|
|
#
|
|
# source; NFC; NFD; NFKC; NFKD
|
|
my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
|
|
|
|
my $source_utf8 = codepoint_string_to_hex($source);
|
|
my $nfc_utf8 = codepoint_string_to_hex($nfc);
|
|
my $nfd_utf8 = codepoint_string_to_hex($nfd);
|
|
my $nfkc_utf8 = codepoint_string_to_hex($nfkc);
|
|
my $nfkd_utf8 = codepoint_string_to_hex($nfkd);
|
|
|
|
print $OUTPUT
|
|
"\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
|
|
}
|
|
|
|
# Output terminator entry
|
|
print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
|
|
print $OUTPUT "\n};\n";
|
|
|
|
close $OUTPUT;
|
|
close $INPUT;
|