#!/usr/bin/perl # # Generation of types and lookup tables for Daitch-Mokotoff soundex. # # Copyright (c) 2023-2024, PostgreSQL Global Development Group # # This module was originally sponsored by Finance Norway / # Trafikkforsikringsforeningen, and implemented by Dag Lem # use strict; use warnings FATAL => 'all'; die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1; my $output_file = $ARGV[0]; # Open the output file open my $OUTPUT, '>', $output_file or die "Could not open output file $output_file: $!\n"; # Parse code table and generate tree for letter transitions. my %codes; my $table = [ {}, [ [ "", "", "" ] ] ]; while () { chomp; my ($letters, $codes) = split(/\s+/); my @codes = map { [ split(/,/) ] } split(/\|/, $codes); my $key = "codes_" . join("_or_", map { join("_", @$_) } @codes); my $val = join( ",\n", map { "\t{\n\t\t" . join(", ", map { "\"$_\"" } @$_) . "\n\t}" } @codes); $codes{$key} = $val; for my $letter (split(/,/, $letters)) { my $ref = $table->[0]; # Link each character to the next in the letter combination. my @c = split(//, $letter); my $last_c = pop(@c); for my $c (@c) { $ref->{$c} //= [ {}, undef ]; $ref->{$c}[0] //= {}; $ref = $ref->{$c}[0]; } # The sound code for the letter combination is stored at the last character. $ref->{$last_c}[1] = $key; } } close(DATA); print $OUTPUT <[0]; for my $key (sort keys %$h) { $ref = $h->{$key}; my $children = "NULL"; if (defined $ref->[0]) { $children = "letter_$letter$key"; hash2code($ref, "$letter$key"); } my $codes = $ref->[1] // "NULL"; push(@letters, "\t{\n\t\t'$key', $children, $codes\n\t}"); } print $OUTPUT "static const dm_letter letter_$letter\[\] =\n{\n"; for (@letters) { print $OUTPUT "$_,\n"; } print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n"; print $OUTPUT "};\n"; } hash2code($table, ''); close $OUTPUT; # Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html # # The conversion from the coding chart to the table should be self # explanatory, but note the differences stated below. # # X = NC (not coded) # # The non-ASCII letters in the coding chart are coded with substitute # lowercase ASCII letters, which sort after the uppercase ASCII letters: # # Ą => a (use '[' for table lookup) # Ę => e (use '\\' for table lookup) # Ţ => t (use ']' for table lookup) # # The rule for "UE" does not correspond to the coding chart, however # it is used by all other known implementations, including the one at # https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey"). # # Note that the implementation assumes that vowels are assigned code # 0 or 1. "J" can be either a vowel or a consonant. # __DATA__ AI,AJ,AY 0,1,X AU 0,7,X a X,X,6|X,X,X A 0,X,X B 7,7,7 CHS 5,54,54 CH 5,5,5|4,4,4 CK 5,5,5|45,45,45 CZ,CS,CSZ,CZS 4,4,4 C 5,5,5|4,4,4 DRZ,DRS 4,4,4 DS,DSH,DSZ 4,4,4 DZ,DZH,DZS 4,4,4 D,DT 3,3,3 EI,EJ,EY 0,1,X EU 1,1,X e X,X,6|X,X,X E 0,X,X FB 7,7,7 F 7,7,7 G 5,5,5 H 5,5,X IA,IE,IO,IU 1,X,X I 0,X,X J 1,X,X|4,4,4 KS 5,54,54 KH 5,5,5 K 5,5,5 L 8,8,8 MN 66,66,66 M 6,6,6 NM 66,66,66 N 6,6,6 OI,OJ,OY 0,1,X O 0,X,X P,PF,PH 7,7,7 Q 5,5,5 RZ,RS 94,94,94|4,4,4 R 9,9,9 SCHTSCH,SCHTSH,SCHTCH 2,4,4 SCH 4,4,4 SHTCH,SHCH,SHTSH 2,4,4 SHT,SCHT,SCHD 2,43,43 SH 4,4,4 STCH,STSCH,SC 2,4,4 STRZ,STRS,STSH 2,4,4 ST 2,43,43 SZCZ,SZCS 2,4,4 SZT,SHD,SZD,SD 2,43,43 SZ 4,4,4 S 4,4,4 TCH,TTCH,TTSCH 4,4,4 TH 3,3,3 TRZ,TRS 4,4,4 TSCH,TSH 4,4,4 TS,TTS,TTSZ,TC 4,4,4 TZ,TTZ,TZS,TSZ 4,4,4 t 3,3,3|4,4,4 T 3,3,3 UI,UJ,UY,UE 0,1,X U 0,X,X V 7,7,7 W 7,7,7 X 5,54,54 Y 1,X,X ZDZ,ZDZH,ZHDZH 2,4,4 ZD,ZHD 2,43,43 ZH,ZS,ZSCH,ZSH 4,4,4 Z 4,4,4