#!/usr/local/bin/perl
#
# Listtodict, from Word Finder for PalmOS
# Kiran S. Kedlaya
# January 30, 2001
#
# Converts a sorted word list into a Word Finder database.
# Usage: "listtodict dictname < list > out.pdb"
#
# At present, everything is converted to lowercase.  This may be altered
# in a future version of Word Finder.
#
# Note: the PDB generation is shamelessly stolen from a script by
# Ka-Ping Yee.
#
# Pipeline:
#   Stage 1  front-code the sorted word list (shared prefix -> one letter)
#   Stage 2  build a Huffman code over the characters, capped at 8 bits
#   Stage 3  pack the bit stream into records and write a PDB image

use strict;
use warnings;

# Seconds between 1904-01-01 and 1970-01-01; added to Unix time by ptime().
use constant PALM_EPOCH_OFFSET => 2082844800;

# Longest Huffman code word the output format allows, in bits.
use constant MAX_CODE_BITS => 8;

# A record is closed once its bit string reaches this many bits.
use constant MAX_RECORD_BITS => 64000 * 8;

# Largest shared-prefix count expressible as a letter 'a'..'z'.
# NOTE(review): the original cap (if any) was lost with the garbled Stage 1
# loop; 25 keeps every count inside the alphabet -- confirm against upstream.
use constant MAX_SHARED_PREFIX => 25;

# Fixed PDB header size in bytes (matches the pack template in pdb_image).
use constant PDB_HEADER_BYTES => 78;

# Cache consulted by timegm(), keyed by pack('C2', year, month).
our %monthtime;

# timegm(sec, min, hour, day, month, year) -> seconds
#
# Two-digit years are widened (<70 -> 20xx, otherwise 19xx), years before
# 1970 are clamped to 1970 and the month is clamped to 0..11.
#
# NOTE(review): monthtime() is not defined anywhere in this script as seen,
# so this sub dies if it is ever reached with a cache miss.  It is only
# reachable through ptime() with a full date, which this script never passes.
sub timegm {
    my ($sec, $min, $hour, $day, $mon, $year) = @_;

    if ($year < 1900) {
        $year += ($year < 70) ? 2000 : 1900;
    }
    $year = 1970 if $year < 1970;
    $mon  = 0    if $mon < 0;
    $mon  = 11   if $mon > 11;

    my $month_start = $monthtime{ pack('C2', $year, $mon) }
        || monthtime($year, $mon);
    return $month_start + $day * 86400 + $hour * 3600 + $min * 60 + $sec;
}

# ptime([time | sec, min, hour, day, month, year]) -> timestamp
#
# With a true sixth argument the date goes through timegm(); otherwise the
# first argument (or the current time when it is false/absent) is used.
# PALM_EPOCH_OFFSET is added to the result in every case.
sub ptime {
    my $unix_time = $_[5] ? timegm(@_) : ($_[0] || time());
    return $unix_time + PALM_EPOCH_OFFSET;
}

# read_word_list($fh) -> list of words
#
# Reads one word per line, strips the line ending and lowercases the word.
# NUL bytes are removed because the code-tree table written by
# serialize_code_tree() uses "\0" to mark internal nodes, so a NUL symbol
# could not be represented.
sub read_word_list {
    my ($fh) = @_;
    my @list;
    while (my $line = <$fh>) {
        chomp $line;
        $line =~ tr/\0//d;
        push @list, lc $line;
    }
    return @list;
}

# front_code_words(@sorted_words) -> list of entries
#
# Stage 1: translate the alphabetized word list into a "compressed" list by
# replacing the initial segment a word shares with its predecessor by a
# single letter ('a' = 0 shared characters, 'b' = 1, ...).  Every entry is
# terminated by "\n".  The first word has no predecessor and is therefore
# always emitted in full with an 'a' prefix.
#
# NOTE(review): in the source this edit was made from, the body of this
# loop had been stripped (everything between "$i" and "> 1").  The
# encoding below is inferred from the stage comment, from the surviving
# first entry ("a" . word . "\n") and from Stage 3's check for "a\n".
# Confirm against the upstream script / the Palm-side decoder.
sub front_code_words {
    my @list = @_;
    my @entries;
    my $prev = '';

    for my $word (@list) {
        my $limit = length($prev) < length($word) ? length($prev) : length($word);
        $limit = MAX_SHARED_PREFIX if $limit > MAX_SHARED_PREFIX;

        my $shared = 0;
        while ($shared < $limit
            && substr($prev, $shared, 1) eq substr($word, $shared, 1))
        {
            $shared++;
        }

        push @entries, chr(ord('a') + $shared) . substr($word, $shared) . "\n";
        $prev = $word;
    }
    return @entries;
}

# build_huffman_codes(@entries) -> hash ref { char => bit string }
#
# Stage 2: count how often each character occurs in the entries and build
# a Huffman code by repeatedly merging the two least frequent nodes.  A
# node is keyed by the characters it contains; merged nodes carry a leading
# "\0", which is skipped when bits are prepended to the member characters.
# Ties are broken by key so that the output is reproducible from run to run.
#
# NOTE(review): the frequency-counting prologue was lost together with the
# Stage 1 loop body and is reconstructed here -- confirm against upstream.
sub build_huffman_codes {
    my @entries = @_;

    my %freqs;
    for my $entry (@entries) {
        $freqs{$_}++ for split //, $entry;
    }
    my %code = map { $_ => '' } keys %freqs;

    while (keys(%freqs) > 1) {
        my @table = sort { $freqs{$a} <=> $freqs{$b} || $a cmp $b } keys %freqs;
        my ($low, $next) = @table[0, 1];
        my $sum = $freqs{$low} + $freqs{$next};
        delete @freqs{ $low, $next };

        for my $char (split //, $low) {
            $code{$char} = '0' . $code{$char} if $char ne "\0";
        }
        for my $char (split //, $next) {
            $code{$char} = '1' . $code{$char} if $char ne "\0";
        }
        $freqs{ "\0" . $low . $next } = $sum;
    }
    return \%code;
}

# _longest_code($code [, $below]) -> (char, length)
#
# Finds the character with the longest code word, optionally considering
# only code words shorter than $below bits.  Keys are scanned in sorted
# order so that ties resolve the same way on every run.  Returns
# (undef, 0) when nothing qualifies.
sub _longest_code {
    my ($code, $below) = @_;
    my ($best_char, $best_len) = (undef, 0);
    for my $char (sort keys %$code) {
        my $len = length $code->{$char};
        next if defined $below && $len >= $below;
        ($best_char, $best_len) = ($char, $len) if $len > $best_len;
    }
    return ($best_char, $best_len);
}

# limit_code_lengths($code)
#
# Eliminates code words longer than MAX_CODE_BITS, modifying %$code in
# place.  Each round takes the longest code word, lets its sibling (same
# code with the last bit flipped) move up one level, and re-attaches the
# long character next to the longest code word that is still shorter than
# MAX_CODE_BITS: that one gets a trailing "1", the moved one its old code
# plus "0".
sub limit_code_lengths {
    my ($code) = @_;

    my ($deep_char, $deep_len) = _longest_code($code);
    while ($deep_len > MAX_CODE_BITS) {
        # Try splitting a codeword of length 7, then 6, etc.
        my ($split_char) = _longest_code($code, MAX_CODE_BITS);
        die "listtodict: cannot shorten the Huffman code to "
            . MAX_CODE_BITS . " bits\n"
            unless defined $split_char;

        # Move the longest word up, and shorten its former neighbor.
        my $sibling = $code->{$deep_char};
        substr($sibling, -1, 1, substr($sibling, -1) eq '1' ? '0' : '1');
        for my $char (keys %$code) {
            chop $code->{$char} if $code->{$char} eq $sibling;
        }
        $code->{$deep_char} = $code->{$split_char} . '0';
        $code->{$split_char} .= '1';

        ($deep_char, $deep_len) = _longest_code($code);
    }
    return;
}

# serialize_code_tree($code) -> string
#
# Rebuilds the code tree as a flat string: characters are visited in code
# order, and before each character one "\0" is written for every level
# descended since the previous character's branch point.
sub serialize_code_tree {
    my ($code) = @_;
    my $table = '';
    my $path  = '';

    for my $char (sort { $code->{$a} cmp $code->{$b} } keys %$code) {
        # Step from the previous code word to the next unvisited branch:
        # drop trailing 1 bits, then turn the remaining final 0 into a 1.
        $path =~ s/1+\z//;
        $path =~ s/0\z/1/;

        $table .= "\0" x (length($code->{$char}) - length($path));
        $table .= $char;
        $path = $code->{$char};
    }
    return $table;
}

# build_records($code, $tree_table, @entries) -> list of record strings
#
# Stage 3a: the first record is the code-tree table.  The entries are then
# Huffman-encoded into a bit string; a record is closed when the bit string
# reaches MAX_RECORD_BITS or when the entry is exactly "a\n" (an empty word
# with no shared prefix).  Each closed record ends with the code for "\n"
# followed by zero bits; the padding literals (14 bits inside the loop, 16
# for the final record) are kept exactly as in the original script.
sub build_records {
    my ($code, $tree_table, @entries) = @_;
    my @records = ($tree_table);
    my $bits    = '';

    for my $entry (@entries) {
        $bits .= join '', map { $code->{$_} } split //, $entry;
        if (length($bits) >= MAX_RECORD_BITS or $entry eq "a\n") {
            push @records,
                pack('B*', $bits . $code->{"\n"} . "00000000000000");
            $bits = '';
        }
    }
    push @records, pack('B*', $bits . $code->{"\n"} . "0000000000000000");
    return @records;
}

# pdb_image($name, @records) -> string
#
# Stage 3b: builds the complete PDB file image: the 78-byte header, one
# 8-byte index entry per record (file offset, then the record number),
# two zero bytes, and the record data.  The name is cut to 31 bytes so the
# 32-byte name field always keeps a terminating NUL.
sub pdb_image {
    my ($name, @records) = @_;
    $name = '' unless defined $name;
    $name = substr($name, 0, 31);

    my $dbattr  = 0;
    my $dbver   = 0;
    my $crtime  = ptime();
    my $mdtime  = ptime();
    my $bktime  = 0;
    my $mdcount = 0;
    my $aioff   = 0;
    my $sioff   = 0;
    my $type    = "DATA";
    my $creator = "Grep";
    my $idseed  = 0;
    my $nsec    = scalar(@records);

    # Header + index + the two filler bytes that precede the record data.
    my $hdrsize = PDB_HEADER_BYTES + 8 * $nsec + 2;

    my $image = pack('a32nnNNNNNNa4a4Nx4n',
        $name, $dbattr, $dbver, $crtime, $mdtime, $bktime, $mdcount,
        $aioff, $sioff, $type, $creator, $idseed, $nsec);

    my $offset = 0;
    for my $index (0 .. $nsec - 1) {
        $image .= pack('NN', $offset + $hdrsize, $index);
        $offset += length $records[$index];
    }

    return $image . "\x00\x00" . join('', @records);
}

# main()
#
# Reads the word list from STDIN and writes the database to STDOUT.  The
# list must come from STDIN rather than <>, because $ARGV[0] is the
# database name and <> would try to open it as an input file.
sub main {
    my $name = defined $ARGV[0] ? $ARGV[0] : '';
    warn qq{Usage: listtodict dictname < list > out.pdb\n} if $name eq '';

    my @list = read_word_list(\*STDIN);

    # As in the original script, empty input still yields a database
    # holding a single empty entry ("a\n").
    @list = ('') unless @list;

    my @entries = front_code_words(@list);
    my $code    = build_huffman_codes(@entries);
    limit_code_lengths($code);
    #print serialize_code_tree($code);
    my @records = build_records($code, serialize_code_tree($code), @entries);

    # The image is binary; keep newline translation out of the way.
    binmode STDOUT;
    print pdb_image($name, @records);
    return;
}

# Run only when executed as a script, so the subs can be loaded for testing.
main() unless caller;

1;