postgresql/contrib/fuzzystrmatch/daitch_mokotoff_header.pl

220 lines
4.9 KiB
Perl
Executable File

#!/usr/bin/perl
#
# Generation of types and lookup tables for Daitch-Mokotoff soundex.
#
# Copyright (c) 2023-2024, PostgreSQL Global Development Group
#
# This module was originally sponsored by Finance Norway /
# Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
#
use strict;
use warnings FATAL => 'all';
die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1;
my $output_file = $ARGV[0];
# Open the output file
open my $OUTPUT, '>', $output_file
or die "Could not open output file $output_file: $!\n";
# Parse code table and generate tree for letter transitions.
my %codes;
my $table = [ {}, [ [ "", "", "" ] ] ];
while (<DATA>)
{
chomp;
my ($letters, $codes) = split(/\s+/);
my @codes = map { [ split(/,/) ] } split(/\|/, $codes);
my $key = "codes_" . join("_or_", map { join("_", @$_) } @codes);
my $val = join(
",\n",
map {
"\t{\n\t\t"
. join(", ", map { "\"$_\"" } @$_) . "\n\t}"
} @codes);
$codes{$key} = $val;
for my $letter (split(/,/, $letters))
{
my $ref = $table->[0];
# Link each character to the next in the letter combination.
my @c = split(//, $letter);
my $last_c = pop(@c);
for my $c (@c)
{
$ref->{$c} //= [ {}, undef ];
$ref->{$c}[0] //= {};
$ref = $ref->{$c}[0];
}
# The sound code for the letter combination is stored at the last character.
$ref->{$last_c}[1] = $key;
}
}
close(DATA);
print $OUTPUT <<EOF;
/*
* Constants and lookup tables for Daitch-Mokotoff Soundex
*
* Copyright (c) 2023-2024, PostgreSQL Global Development Group
*
* This file is generated by daitch_mokotoff_header.pl
*/
/* Coding chart table: Soundex codes */
typedef char dm_code[2 + 1]; /* One or two sequential code digits + NUL */
typedef dm_code dm_codes[3]; /* Start of name, before a vowel, any other */
/* Coding chart table: Letter in input sequence */
struct dm_letter
{
char letter; /* Present letter in sequence */
const struct dm_letter *letters; /* List of possible successive letters */
const dm_codes *codes; /* Code sequence(s) for complete sequence */
};
typedef struct dm_letter dm_letter;
/* Codes for letter sequence at start of name, before a vowel, and any other. */
EOF
for my $key (sort keys %codes)
{
print $OUTPUT "static const dm_codes $key\[2\] =\n{\n"
. $codes{$key}
. "\n};\n";
}
print $OUTPUT <<EOF;
/* Coding for alternative following letters in sequence. */
EOF
sub hash2code
{
my ($ref, $letter) = @_;
my @letters = ();
my $h = $ref->[0];
for my $key (sort keys %$h)
{
$ref = $h->{$key};
my $children = "NULL";
if (defined $ref->[0])
{
$children = "letter_$letter$key";
hash2code($ref, "$letter$key");
}
my $codes = $ref->[1] // "NULL";
push(@letters, "\t{\n\t\t'$key', $children, $codes\n\t}");
}
print $OUTPUT "static const dm_letter letter_$letter\[\] =\n{\n";
for (@letters)
{
print $OUTPUT "$_,\n";
}
print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n";
print $OUTPUT "};\n";
}
hash2code($table, '');
close $OUTPUT;
# Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html
#
# The conversion from the coding chart to the table should be self
# explanatory, but note the differences stated below.
#
# X = NC (not coded)
#
# The non-ASCII letters in the coding chart are coded with substitute
# lowercase ASCII letters, which sort after the uppercase ASCII letters:
#
# Ą => a (use '[' for table lookup)
# Ę => e (use '\\' for table lookup)
# Ţ => t (use ']' for table lookup)
#
# The rule for "UE" does not correspond to the coding chart, however
# it is used by all other known implementations, including the one at
# https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey").
#
# Note that the implementation assumes that vowels are assigned code
# 0 or 1. "J" can be either a vowel or a consonant.
#
__DATA__
AI,AJ,AY 0,1,X
AU 0,7,X
a X,X,6|X,X,X
A 0,X,X
B 7,7,7
CHS 5,54,54
CH 5,5,5|4,4,4
CK 5,5,5|45,45,45
CZ,CS,CSZ,CZS 4,4,4
C 5,5,5|4,4,4
DRZ,DRS 4,4,4
DS,DSH,DSZ 4,4,4
DZ,DZH,DZS 4,4,4
D,DT 3,3,3
EI,EJ,EY 0,1,X
EU 1,1,X
e X,X,6|X,X,X
E 0,X,X
FB 7,7,7
F 7,7,7
G 5,5,5
H 5,5,X
IA,IE,IO,IU 1,X,X
I 0,X,X
J 1,X,X|4,4,4
KS 5,54,54
KH 5,5,5
K 5,5,5
L 8,8,8
MN 66,66,66
M 6,6,6
NM 66,66,66
N 6,6,6
OI,OJ,OY 0,1,X
O 0,X,X
P,PF,PH 7,7,7
Q 5,5,5
RZ,RS 94,94,94|4,4,4
R 9,9,9
SCHTSCH,SCHTSH,SCHTCH 2,4,4
SCH 4,4,4
SHTCH,SHCH,SHTSH 2,4,4
SHT,SCHT,SCHD 2,43,43
SH 4,4,4
STCH,STSCH,SC 2,4,4
STRZ,STRS,STSH 2,4,4
ST 2,43,43
SZCZ,SZCS 2,4,4
SZT,SHD,SZD,SD 2,43,43
SZ 4,4,4
S 4,4,4
TCH,TTCH,TTSCH 4,4,4
TH 3,3,3
TRZ,TRS 4,4,4
TSCH,TSH 4,4,4
TS,TTS,TTSZ,TC 4,4,4
TZ,TTZ,TZS,TSZ 4,4,4
t 3,3,3|4,4,4
T 3,3,3
UI,UJ,UY,UE 0,1,X
U 0,X,X
V 7,7,7
W 7,7,7
X 5,54,54
Y 1,X,X
ZDZ,ZDZH,ZHDZH 2,4,4
ZD,ZHD 2,43,43
ZH,ZS,ZSCH,ZSH 4,4,4
Z 4,4,4