#!/usr/bin/perl use strict; use Getopt::Std; use locale; my %opt; getopts('l:he:s:ap:o:m:f', \%opt); if ( $opt{h} || ! ($opt{e}||$opt{s}) || !$opt{l} ) { print< Usage: $0 -l LOCALENAME [ -e FILENAME ] [ -s FILENAME ] [ -p PREFIX ] [ -o FILENAME ] [ -a ] [ -m NUMBER ] -e FILENAME - file with endings of word -s FILENAME - file with list of stop-word -o FILENAME - out file, default STDOUT -a - stop-word are strimmed -p PREFIX - prefix of function and etc, default strimmed locale -m NUMBER - minimal length of rest after semming, default 3 -l LOCALENAME - name of locale -f - do not call tolower for each char At least one of -e or -s must be defined EOT exit; } if ( ! defined $opt{p} ) { $opt{p} = $opt{l}; $opt{p}=~s/[^a-zA-Z0-9_]+//g; } $opt{m}=3 if ! defined $opt{m}; my ($enddata,$stopdata) = ('',''); my $maxchild = 0; if ( $opt{e} ) { my @tree; buildtree(\@tree, $opt{e}, 1); printstruct( \@tree, 0, \$enddata); undef @tree; } if ( $opt{s} ) { my @tree; buildtree(\@tree, $opt{s}, 0); printstruct( \@tree, 0, \$stopdata); undef @tree; } die "No data\n" if ( ! (length $enddata || length $stopdata) ); $enddata = "\t{0,0,0,0}" if ( ! length $enddata ); $stopdata = "\t{0,0,0,0}" if ( ! length $stopdata ); my $fh=\*STDOUT; if ( $opt{o} ) { open(OUT,">$opt{o}") || die "Can;t open file '$opt{o}' for writing\n"; $fh = \*OUT; } my $linktype = 'uint32'; if ( $maxchild <= 0xff ) { $linktype='uint8'; } elsif ( $maxchild <= 0xffff ) { $linktype='uint16'; } my $wherecheck = ( $opt{a} ) ? "NULL,\n\t$opt{p}_is_stopword" : "$opt{p}_is_stopword,\n\tNULL"; my ($tolower, $resttolower) = ('',''); if ( ! $opt{f} ) { $tolower = '*cur = tolower( *cur );'; $resttolower=<= 0 ) { *cur = tolower(*cur); cur--; } EOT } print {$fh} < typedef struct { uint8 val; uint8 flag; uint8 right; $linktype child; } $opt{p}_NODE; /* is exists left tree ? */ #define L 0x01 /* finish word flag */ #define F 0x02 #define ISLEFT(x) ((($opt{p}_NODE*)x)->flag & L) #define ISFINISH(x) ((($opt{p}_NODE*)x)->flag & F) #define MINLENREST $opt{m} static $opt{p}_NODE $opt{p}_endstree[]={ $enddata }; static $opt{p}_NODE $opt{p}_stoptree[]={ $stopdata }; static char* $opt{p}_stem( void* obj, char *in, int *len ) { $opt{p}_NODE *ptr = $opt{p}_endstree; int result = 0; uint8 *buf = (uint8 *)in; uint8 *cur = buf + (*len) - 1; while( cur - buf >= MINLENREST ) { $tolower if ( ptr->val == *cur ) { if ( ISFINISH(ptr) ) result = buf + (*len) - cur; cur--; if ( ! ptr->child ) break; ptr += ptr->child; } else if ( ptr->val > *cur ) { if ( ISLEFT(ptr) ) ptr++; else break; } else { if ( ptr->right ) ptr += ptr->right; else break; } } $resttolower *len -= result; return in; } static int $opt{p}_is_stopword( void *obj, char *in, int len ) { $opt{p}_NODE *ptr = $opt{p}_stoptree; int result = 0; uint8 *buf = (uint8 *)in; uint8 *cur = buf; while( cur - buf < len ) { $tolower if ( ptr->val == *cur ) { cur++; if ( ISFINISH(ptr) ) result = cur - buf; if ( ! ptr->child ) break; ptr += ptr->child; } else if ( ptr->val > *cur ) { if ( ISLEFT(ptr) ) ptr++; else break; } else { if ( ptr->right ) ptr += ptr->right; else break; } } return (result==len) ? 1 : 0; } #undef L #undef F #undef ISLEFT #undef ISFINISH #undef MINLENREST #endif /* DICT_BODY */ #ifdef DICT_TABLE TABLE_DICT_START \"$opt{l}\", NULL, NULL, $opt{p}_stem, $wherecheck TABLE_DICT_END #endif EOT close($fh) if ( $fh != \*STDOUT ); sub buildtree { my ($reftree,$file, $needreverse) = @_; open(DATA,$file) || die "Can't open file '$file'\n"; while() { chomp; next if ! length $_; $_ = lc($_) if ! $opt{f}; addtostruct( $reftree, ( $needreverse ) ? scalar(reverse($_)) : $_ ); } close DATA; } sub mkbintree { my ( $start, $stop, $rprop, $rres) = @_; my $middle = $start + int( ($stop-$start)/2 ); push( @$rres, $rprop->[$middle] ); my $idx = $#$rres; $rres->[$idx]{right}=0; $rres->[$idx]{left}=0; return 1 if ( $start == $stop ); my $leftsize = 0; if ( $middle!=$start ) { $rres->[$idx]{left}=1; $leftsize = mkbintree( $start, $middle-1, $rprop, $rres ); $rres->[$idx]{right}=$leftsize+1; } else { $rres->[$idx]{right} = 1; } return 1 + $leftsize + mkbintree( $middle+1, $stop, $rprop, $rres ); } sub addtostruct { my $node = shift; my ($char, $subval) = split('', shift, 2); $char = ord( $char ); if ( ! defined $node->[$char] ) { $node->[$char] = {}; $node->[$char]{finish} = length $subval; $node->[$char]{child} = []; } elsif ( ! length $subval ) { $node->[$char]{finish} = 0; } addtostruct( $node->[$char]{child}, $subval ) if ( length $subval ); } sub printstruct { my ($node, $pre, $refout) = @_; my $add = 0; my @prop; my $outchild; my $current = 0; my $poschild=0; my @tmp; foreach my $i ( 0..255 ) { next if ( !defined $node->[ $i ] ); push @prop , { val=>$i, nchild=>printstruct( $node->[ $i ]{child}, 1, \$outchild ), poschild=>$poschild }; $poschild += $prop[$#prop]{nchild}; } return 0 if $#prop < 0; if ($pre) { $$refout .= ",\n\n"; } mkbintree(0,$#prop,\@prop,\@tmp); @prop = @tmp; $current=$#prop+1; foreach my $i ( 0..$#prop ) { my $flag = ($prop[$i]{left}) ? 'L' : undef; if ( $node->[ $prop[$i]{val} ]{finish}==0 ) { $flag .= '|' if defined $flag; $flag .= 'F'; } elsif ( ! defined $flag ) { $flag='0'; } $$refout .= "\t{'".chr( $prop[$i]{val} )."',". $flag.','. $prop[$i]{right}.','. (($prop[$i]{nchild}==0)?0:($prop[$i]{poschild}+$current)).'}'. (($i==$#prop)? '' : ",\n"); $maxchild = $prop[$i]{poschild}+$current if ( $prop[$i]{nchild} && $prop[$i]{poschild}+$current > $maxchild ); $current--; $add += $prop[$i]{nchild}; } $$refout .= $outchild; return $#prop+1 + $add; }