#!/usr/bin/perl =head1 NAME huffman - Generate huffman encodings =head1 SYNOPSIS huffman -h huffman -man huffman -f FILENAME [-t TOKENIZER -o OUTPUT] FILENAME is the text file to process. TOKENIZER is the type of tokenizer regular expression to use on the text. OUTPUT is an optional file name to write the data to. Defaults to STDOUT. =head1 BUGS Uses pipes as key delimiters. Should either use hashes or encode pipes. Encoding pipes should save B of memory! =cut # A little anal-retention. use strict; use warnings; # Happily pushing frames onto the call stack since 1836! no warnings 'recursion'; # Use all the nice local letters. use locale; # Dumps more nicely than we would by hand. use Data::Dumper; # Handles command line options better than we do by hand. use Getopt::Long; # Docs to the rescue! use Pod::Usage; # Hash ribbon campain against # namespace pollution! my %config; # Name of text file to process $config{in_file} = ''; # Tokenizer type $config{type} = 'letters'; # I don't trust autovivification all the time. # Output file name $config{out_file} = undef; # Need help? $config{help} = undef; $config{man} = undef; GetOptions( 'help' => \$config{help}, # Set help flag to 1 'man' => \$config{man}, # Set manual flag to 1 'file=s' => \$config{in_file}, # Get input file name 'tokenizer=s' => \$config{type}, # Get tokenizer type 'output=s' => \$config{out_file}, # Get output file name ); # Full man page pod2usage( -verbose => 2 ) if $config{man}; # Synopsis pod2usage( -verbose => 1 ) if $config{help}; # Generate usage information if the user # did not supply a valid file name. if ( not defined $config{in_file} or not -f $config{in_file} ) { pod2usage( -message => 'No input file name specified.', -exitval => 1, -verbose => 1, ); } # Generate usage information if the user # supplied an invalid output file name. if ( defined $config{out_file} and $config{out_file} !~ /\S/ ) { pod2usage( -message => 'Optional output file name invalid.', -exitval => 1, -verbose => 1, ); } # Allowed tokenizer types. # They will be run-time evaluated later. # Do that in Java! Tee hee hee! # If you have trouble understanding the code: # Note that "$line =~ /RE/g" generates a list of all RE matches. my %types = ( # Grab all words, ignore case, do so greedily! words => '$alphabet{lc($_)}++ foreach $line =~ /\w+/g;', # Grab all non-whitespace character sequences, greedily! strings => '$alphabet{$_}++ foreach $line =~ /\S+/g;', # Grab all non-whitespace characters. chars => '$alphabet{$_}++ foreach $line =~ /\S/g;', # Grab all word characters. letters => '$alphabet{lc($_)}++ foreach $line =~ /\w/g;', # Grab all pairs of characters, non-whitespace. cpairs => <<'HERE', # Get "Ha", "ll" of "Hallo" $alphabet{$_}++ foreach $line =~ /\S\S/g; substr $line, 0, 1, ''; # Get "al", "lo" of "Hallo" $alphabet{$_}++ foreach $line =~ /\S\S/g; HERE # Grab all pairs of word characters. wpairs => <<'HERE', # See the cpairs comments. $alphabet{lc($_)}++ foreach $line =~ /\w\w/g; substr $line, 0, 1, ''; $alphabet{lc($_)}++ foreach $line =~ /\w\w/g; HERE ); # Default tokenizer to "letters" $config{type} = 'letters' if not defined $config{type} or not exists $types{ $config{type} }; # Hash of tokens/occurrances my %alphabet; # Open input file open my $fh, '<', $config{in_file} or die "Could not open input file ($config{in_file}) for reading: $!"; # Process line by line. # We can do that because we use whitespace as # token delimiter anyway. This may save loads of memory. while (my $line = <$fh>) { # It is rather slow to recompile the # code every time, but it is convenient. eval $types{ $config{type} }; # Ooops!? Errors? die $@ if $@; } # Done reading. close $fh; # Invoke the huffman implementation. my($encoded, $tree) = huffman(\%alphabet); # Redirect STDOUT to a file of wanted. if ( defined $config{out_file} ) { open STDOUT, '>', $config{out_file} or die "Could not redirect STDOUT to file ($config{out_file}): $!"; } # For now, we just dump all the data we have. print Dumper \%alphabet; print Dumper $encoded; # Print the formatted tree. # Using Data::Dumper would yield a very deeply nested tree, # so we do it ourselves this time. Yuck! print @{ format_tree($tree) }; # format_tree # Takes a hash reference to a tree pseudo-object # as argument. # Returns a list of lines suitable for output # as a tree. # Calls itself recursively. sub format_tree { my $tree = shift; # Buffer. String concatenation is way slower, # so let perl handle the output buffering later. my @out; # Return token name if there are no further branches. if ( not defined $tree->{l} and not defined $tree->{o} ) { push @out, '"' . $tree->{k} . '"' . "\n"; return \@out; # Never ever pass long lists around! } # Is there an L branch? if ( defined $tree->{l} ) { push @out, "L{\n"; # Apply ourselves to a subset of the tree. my $ary = format_tree( $tree->{l} ); # Indent. Hey, Perl can do functional programming! push @out, map { ' ' . $_ } @$ary; push @out, " }\n"; } # O branch? if ( defined $tree->{o} ) { push @out, "O{\n"; # Apply ourselves to a subset of the tree. my $ary = format_tree( $tree->{o} ); # Indent. Hey, Perl can do functional programming! push @out, map { ' ' . $_ } @$ary; push @out, " }\n"; } return \@out; } # huffman # Generate huffman encoding based on an alphabet # that is passed as a hash reference. # Returns a hashref of encoded tokens and # a reference to the generated tree. sub huffman { my $alpha = shift; # Generate our tree object my $tree = { # Apply to all tokens in the alphabet # Associate token name with a hash ref map { $_ => { l => undef, # no L branch yet o => undef, # no O branch either p => $alpha->{$_}, # p ^= probability k => $_, # k (key) ^= token name } } keys %$alpha }; # Recursively combine the two least probable elements # Iteration is way faster in Perl. Sue me. $tree = _recur_combine_two($tree); # After the recursive combination of the # two least probable elements, we only have one # top-level key left. (Well, it's a tree after all.) $tree = $tree->{ (keys %$tree)[0] }; # Hash of encoded strings my %enc; # One string for every token foreach my $token ( keys %$alpha ) { # _string_tree_walk as opposed to # _binary_tree_walk to be implemented later my $encoded = _string_tree_walk($tree, $token); $enc{$token} = $encoded; } return \%enc, $tree; } # _string_tree_walk # Takes a tree to walk and a token to encode # as arguments. # Returns a string of L's and O's. # Calls itself recursively and passes a third argument: # the encoded string. sub _string_tree_walk { my $tree = shift; my $token = shift; # The encoded string we have walked so far. my $enc = shift; # Initialize $enc = '' if not defined $enc; if ( # There is an L branch. defined $tree->{l} and # And its key contains our token. # RE parsing left as an exercise to the reader. $tree->{l}{k} =~ /(?:^|\|)\Q$token\E(?:\||$)/ ) { # Recurse into the L branch. return _string_tree_walk( $tree->{l}, $token, $enc.'l' ); } elsif ( # There is an 0 branch. defined $tree->{o} and # And its key contains our token. $tree->{o}{k} =~ /(?:^|\|)\Q$token\E(?:\||$)/ ) { # Recurse into the O branch. return _string_tree_walk( $tree->{o}, $token, $enc.'o' ); } else { # No branches matching! Done encoding, return the result. return $enc; } # Sanity? die "Heck, you should never reach this!"; } # _recur_combine_two # Takes a tree structure as argument and modifies # it by combining the two least probable branches # into one. sub _recur_combine_two { # Tree object my $obj = shift; # Done if we have only one branch left. keys %$obj > 1 or return $obj; # Get the names of the two least probable branches my @combine = _lowest_p( $obj, 2 ); # Generate new key joining the old ones with a pipe # !!!FIXME!!! # Reverse to get the more probable first. # May make the RE's faster. my $new_key = join '|', reverse @combine; # my $new_tree = { # combined probability p => $obj->{$combine[0]}{p} + $obj->{$combine[1]}{p}, # The more probable branch l => $obj->{$combine[1]}, # The less probable branch o => $obj->{$combine[0]}, # The new, combined key k => $new_key, }; # Replace nodes $obj->{$new_key} = $new_tree; delete $obj->{$combine[0]}; delete $obj->{$combine[1]}; # Return ourselves. return _recur_combine_two($obj); } # _lowest_p # Takes a tree object and a number of elements to # extract as arguments. # Returns the names of the the tree branches with the # least probability. sub _lowest_p { my $hash = shift; my $no = shift || 1; # I know this can be done in linear time, but I # am too lazy to be bothered to do it. return( (sort { $hash->{$a}{p} <=> $hash->{$b}{p} } keys %$hash )[0..$no-1] ); }