#!/usr/bin/perl # # This script substracts all suffixes of all words in a specific column in a table # and generates output that can be loaded into a new table with the # psql '\copy' command. The new table should have the following structure: # # create table tab ( # string text, # id oid # ); # # Note that you cannot use 'copy' (the SQL-command) directly, because # there's no '\.' included at the end of the output. # # The output can be fed through the UNIX commands 'uniq' and 'sort' # to generate the smallest and sorted output to populate the fti-table. # # Example: # # fti.pl -u -d mydb -t mytable -c mycolumn,mycolumn2 -f myfile # sort -o myoutfile myfile # uniq myoutfile sorted-file # # psql -u mydb # # \copy my_fti_table from myfile # # create index fti_idx on my_fti_table (string,id); # # create function fti() returns trigger as # '/path/to/fti/file/fti.so' # language 'C'; # # create trigger my_fti_trigger after update or insert or delete # on mytable # for each row execute procedure fti(my_fti_table, mycolumn); # # Make sure you have an index on mytable(oid) to be able to do somewhat # efficient substring searches. #use lib '/usr/local/pgsql/lib/perl5/'; use lib '/mnt/web/guide/postgres/lib/perl5/site_perl'; use Pg; use Getopt::Std; $PGRES_EMPTY_QUERY = 0 ; $PGRES_COMMAND_OK = 1 ; $PGRES_TUPLES_OK = 2 ; $PGRES_COPY_OUT = 3 ; $PGRES_COPY_IN = 4 ; $PGRES_BAD_RESPONSE = 5 ; $PGRES_NONFATAL_ERROR = 6 ; $PGRES_FATAL_ERROR = 7 ; # the minimum length of word to include in the full text index $MIN_WORD_LENGTH = 2; # the minimum length of the substrings in the full text index $MIN_SUBSTRING_LENGTH = 2; $[ = 0; # make sure string offsets start at 0 sub break_up { my $string = pop @_; # convert strings to lower case $string = lc($string); @strings = split(/\W+/, $string); @subs = (); foreach $s (@strings) { $len = length($s); next if ($len <= $MIN_WORD_LENGTH); for ($i = 0; $i <= $len - $MIN_SUBSTRING_LENGTH; $i++) { $tmp = substr($s, $i); push(@subs, $tmp); } } return @subs; } sub connect_db { my $dbname = shift @_; my $user = shift @_; my $passwd = shift @_; if (!defined($dbname) || $dbname eq "") { return 1; } $connect_string = "dbname=$dbname"; if ($user ne "") { if ($passwd eq "") { return 0; } $connect_string = "$connect_string user=$user password=$passwd ". "authtype=password"; } $PG_CONN = PQconnectdb($connect_string); if (PQstatus($PG_CONN)) { print STDERR "Couldn't make connection with database!\n"; print STDERR PQerrorMessage($PG_CONN), "\n"; return 0; } return 1; } sub quit_prog { close(OUT); unlink $opt_f; if (defined($PG_CONN)) { PQfinish($PG_CONN); } exit 1; } sub get_username { print "Username: "; chop($n = ); return $n;; } sub get_password { print "Password: "; system("stty -echo < /dev/tty"); chop($pwd = ); print "\n"; system("stty echo < /dev/tty"); return $pwd; } sub main { getopts('d:t:c:f:u'); if (!$opt_d || !$opt_t || !$opt_c || !$opt_f) { print STDERR "usage: $0 [-u] -d database -t table -c column[,column...] ". "-f output-file\n"; return 1; } @cols = split(/,/, $opt_c); if (defined($opt_u)) { $uname = get_username(); $pwd = get_password(); } else { $uname = ""; $pwd = ""; } $SIG{'INT'} = 'quit_prog'; if (!connect_db($opt_d, $uname, $pwd)) { print STDERR "Connecting to database failed!\n"; return 1; } if (!open(OUT, ">$opt_f")) { print STDERR "Couldnt' open file '$opt_f' for output!\n"; return 1; } PQexec($PG_CONN, "SET search_path = public"); PQexec($PG_CONN, "begin"); $query = "declare C cursor for select (\""; $query .= join("\" || ' ' || \"", @cols); $query .= "\") as string, oid from $opt_t"; $res = PQexec($PG_CONN, $query); if (!$res || (PQresultStatus($res) != $PGRES_COMMAND_OK)) { print STDERR "Error declaring cursor!\n"; print STDERR PQerrorMessage($PG_CONN), "\n"; PQfinish($PG_CONN); return 1; } PQclear($res); $query = "fetch in C"; while (($res = PQexec($PG_CONN, $query)) && (PQresultStatus($res) == $PGRES_TUPLES_OK) && (PQntuples($res) == 1)) { $col = PQgetvalue($res, 0, 0); $oid = PQgetvalue($res, 0, 1); @subs = break_up($col); foreach $i (@subs) { print OUT "$i\t$oid\n"; } } if (!$res || (PQresultStatus($res) != PGRES_TUPLES_OK)) { print STDERR "Error retrieving data from backend!\n"; print STDERR PQerrorMEssage($PG_CONN), "\n"; PQfinish($PG_CONN); return 1; } PQclear($res); PQfinish($PG_CONN); return 0; } exit main();