postgresql/src/common/unicode/generate-unicode_normprops_table.pl

#!/usr/bin/perl
#
# Generate table of Unicode normalization "quick check" properties
# (see UAX #15).  Pass DerivedNormalizationProps.txt as argument.  The
# output is on stdout.
#
# Copyright (c) 2020-2023, PostgreSQL Global Development Group

use strict;
use warnings;

use FindBin;
use lib "$FindBin::RealBin/../../tools/";
use PerfectHash;

my %data;

print
  "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";

print <<EOS;
#include "common/unicode_norm.h"

/*
 * Normalization quick check entry for codepoint.  We use a bit field
 * here to save space.
 */
typedef struct
{
	unsigned int codepoint:21;
	signed int	quickcheck:4;	/* really UnicodeNormalizationQC */
} pg_unicode_normprops;

/* Typedef for hash function on quick check table */
typedef int (*qc_hash_func) (const void *key);

/* Information for quick check lookup with perfect hash function */
typedef struct
{
	const pg_unicode_normprops *normprops;
	qc_hash_func	hash;
	int		num_normprops;
} pg_unicode_norminfo;
EOS

foreach my $line (<ARGV>)
{
	chomp $line;
	$line =~ s/\s*#.*$//;
	next if $line eq '';
	my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;
	next if $prop !~ /_QC/;

	my ($first, $last);
	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	foreach my $cp (hex($first) .. hex($last))
	{
		$data{$prop}{$cp} = $value;
	}
}

# We create a separate array for each normalization form rather than,
# say, a two-dimensional array, because that array would be very
# sparse and would create unnecessary overhead especially for the NFC
# lookup.
foreach my $prop (sort keys %data)
{
	# Don't build the tables for the "D" forms because they are too
	# big.  See also unicode_is_normalized_quickcheck().
	next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";

	print "\n";
	print
	  "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";

	my %subdata = %{ $data{$prop} };
	my @cp_packed;
	foreach my $cp (sort { $a <=> $b } keys %subdata)
	{
		my $qc;
		if ($subdata{$cp} eq 'N')
		{
			$qc = 'UNICODE_NORM_QC_NO';
		}
		elsif ($subdata{$cp} eq 'M')
		{
			$qc = 'UNICODE_NORM_QC_MAYBE';
		}
		else
		{
			die;
		}
		printf "\t{0x%04X, %s},\n", $cp, $qc;

		# Save the bytes as a string in network order.
		push @cp_packed, pack('N', $cp);
	}

	print "};\n";

	# Emit the definition of the perfect hash function.
	my $funcname = $prop . '_hash_func';
	my $f        = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
		fixed_key_length => 4);
	printf "\n/* Perfect hash function for %s */", $prop;
	print "\nstatic $f\n";

	# Emit the structure that wraps the hash lookup information into
	# one variable.
	printf "/* Hash lookup information for %s */", $prop;
	printf "\nstatic const pg_unicode_norminfo ";
	printf "UnicodeNormInfo_%s = {\n", $prop;
	printf "\tUnicodeNormProps_%s,\n", $prop;
	printf "\t%s,\n",                  $funcname;
	printf "\t%d\n",                   scalar @cp_packed;
	printf "};\n";
}