Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- use strict; use warnings;
- use List::Util 'sum';
- #use Data::Dumper;
- use Getopt::Long qw(:config gnu_getopt);
# Optionally swap in the rand() exported by Math::Random::MT::Perl.
# The attempt is made inside BEGIN so that it happens while the rest of the
# file is still being compiled; when the module cannot be loaded the script
# only warns and carries on (presumably with the builtin rand -- nothing else
# in this file provides one).
BEGIN {
    my $module = 'Math::Random::MT::Perl';
    eval {
        require Math::Random::MT::Perl;
        $module->import('rand');
    };
    if ($@) {
        warn "Optional module Math::Random::MT::Perl not found.\n";
    }
}
# --- constants -------------------------------------------------------------

# Dataset names accepted on the command line; main() registers each one as a
# long option.
my @options = qw(
    eng-1M eng-all eng-fiction eng-gb eng-us
    fre ger heb rus spa
    irish german-medical bulgarian catalan swedish
    brazilian canadian-english-insane manx italian ogerman
    portuguese polish gaelic finnish
);

# Order of the n-gram model: generate() looks up each new character using the
# previous $n - 1 characters as the key.
my $n = 4;

# Base name of the data file used when no dataset option is given.
my $default_dataset = 'Eng1M';

# --- data from loaded files ------------------------------------------------

# One array reference per data file that was loaded.
my @loaded_data;

# --- data after normalizing and combining datasets -------------------------

# context string => { next character => weight }
my %grams;

# word length => weight
my %freqs;

# --- command line options --------------------------------------------------

my $debug_mode;             # -d
my $target_offset = -4;     # -o; needs testing
my $dont_normalize;         # -N
# Pick one key from a weight table at random, with probability proportional
# to its weight.
#
# Arguments: a flat key => weight list (typically a hash). Weights may be
#            integers or fractions; they are expected to be non-negative.
# Returns:   the chosen key, or nothing (undef in scalar context) when the
#            table is empty.
#
# The (%) prototype of the earlier version was dropped: it only forced list
# context, which a plain list-operator call provides anyway.
sub pick {
    my %f = @_;
    my @c = keys %f;
    return unless @c;    # nothing to choose from

    my @w = @f{@c};

    # Keep the draw fractional. Truncating it with int() turns every draw into
    # 0 whenever the weights sum to about 1 (normalized tables), which makes
    # the first key with a positive weight win every time.
    my $r = rand(sum(@w));

    for my $i (0 .. $#c) {
        return $c[$i] if $r < $w[$i];
        $r -= $w[$i];
    }

    # Only reachable through floating-point round-off or an all-zero table.
    print "end of pick loop reached. returned $c[-1]\n" if $debug_mode;
    return $c[-1];
}
# Build one pseudo-word, character by character, from the n-gram tables.
#
# Reads the file-level %grams, %freqs, $n, $target_offset and $debug_mode.
# A length is drawn from %freqs and shifted by $target_offset to get a target.
# Starting from a context of $n - 1 spaces, each next character is drawn from
# the table for the last $n - 1 characters, with the weight of the terminating
# space scaled by 2 ** (current length - target). The word ends when a space
# is drawn.
#
# Returns the word with all whitespace removed; in debug mode the target is
# appended as " (Target: N)".
sub generate {
    # The parentheses matter: "pick %freqs + $target_offset" parses as
    # pick(%freqs + $target_offset), because a list operator binds more
    # loosely than "+".
    my $drawn  = pick(%freqs);
    my $target = (defined $drawn ? $drawn : 0) + $target_offset;

    my $context_len = $n - 1;
    my $word        = ' ' x $context_len;

    while (1) {
        my $len       = length($word) - $context_len;
        my $followers = $grams{ substr($word, -$context_len, $context_len) };

        # A context the loaded data never saw: finish the word here rather
        # than dereferencing undef (fatal under strict refs).
        last unless $followers && %$followers;

        # Work on a copy so the shared table is not rescaled permanently.
        my %ftable = %$followers;

        # Only rescale a terminator that exists; "*=" on a missing key would
        # create a zero-weight entry and emit an uninitialized-value warning.
        $ftable{' '} *= 2**($len - $target) if exists $ftable{' '};

        my $c = pick(%ftable);
        $word .= $c;
        last if $c eq ' ';
    }

    $word =~ s/\s//g;
    $word = "$word (Target: $target)" if $debug_mode;
    return $word;
}
# Print the usage summary, the default dataset and every dataset option to
# standard output.
#
# Reads the file-level $default_dataset and @options. Always returns 0.
sub help {
    print "Usage: words [-dhNo] [DATASETS...] [NUMBER_OF_WORDS]\n";
    print "default: $default_dataset\n";

    # Prefix every name individually. Joining on ' --' left the first dataset
    # without its leading dashes.
    print 'valid datasets: ' . join(' ', map { "--$_" } @options) . "\n";
    return 0;
}
# Build the Getopt::Long callback for one dataset option.
#
# Argument: $mod - base name of the data file (for example "EngUs").
# Returns:  a code reference. When invoked for its option it evaluates
#           ./Data/$mod.pl and, on success, appends everything the file
#           returned to @loaded_data as one array reference. Failures are
#           reported with warn, so the remaining options are still processed.
#
# The ($) prototype of the earlier version was dropped; it did no validation.
sub handle_opt {
    my ($mod) = @_;

    # The explicit "./" is required: since Perl 5.26 the current directory is
    # no longer in @INC, so do() cannot find a bare relative path.
    my $name = "./Data/$mod.pl";

    return sub {
        my ($opt_name) = @_;

        # List context keeps every table the file returns; scalar context
        # would keep only the last one.
        # assumes a data file returns (n-gram table, length histogram), as
        # main() unpacks it -- TODO confirm against the data files
        my @tables = do $name;
        my $first  = $tables[0];

        if ($first) {
            push @loaded_data, [@tables];
            return;
        }

        warn "Couldn't parse $name: $@" if $@;
        warn "Invalid option $opt_name: couldn't load $name: $!" unless defined $first;
        warn "Invalid option $opt_name: couldn't run $name" unless $first;
        return;
    };
}
# Entry point: parse the command line, load and merge the selected datasets,
# then print the requested number of generated words on one line.
#
# Options: -d debug output, -h/--help usage, -N skip normalization,
#          -o OFFSET target-length offset, plus one long option per entry in
#          @options. The first remaining argument is the number of words
#          (default 1).
# Dies when the default data file is needed but cannot be loaded, or when no
# usable data was loaded at all.
sub main {
    my $help;

    # Each dataset option maps to a data file named by capitalising every
    # dash-separated part (eng-us -> EngUs). The substitution runs on a copy:
    # applying it to the aliased $_ inside map rewrote @options itself, so
    # help() printed the file names instead of the option names.
    my @dataset_opts;
    for my $opt (@options) {
        (my $module = $opt) =~ s/(^|-)(.)/\u$2/g;
        push @dataset_opts, $opt => handle_opt($module);
    }

    GetOptions(
        'd'      => \$debug_mode,
        'h|help' => \$help,
        'N'      => \$dont_normalize,
        'o=s'    => \$target_offset,
        @dataset_opts,
    );
    return help() if $help;

    # Fall back to the default dataset only when nothing was loaded, and
    # evaluate its file once. ("@array ||= ..." does not compile.) The explicit
    # "./" is needed because Perl 5.26+ no longer has "." in @INC.
    unless (@loaded_data) {
        my $file   = "./Data/$default_dataset.pl";
        my @tables = do $file;
        unless ($tables[0]) {
            die "Couldn't compile default datafile: $@" if $@;
            die "Couldn't load default datafile: $!" unless defined $tables[0];
            die "Default datafile $file returned no data\n";
        }
        push @loaded_data, [@tables];
    }

    # Combine the datasets, normalizing each table unless -N was given.
    for my $dataset (@loaded_data) {
        my ($data, $fdata) = @$dataset;
        unless (ref $data eq 'HASH' && ref $fdata eq 'HASH') {
            warn "Skipping malformed dataset: expected two hash references\n";
            next;
        }

        # Character frequencies per context.
        while (my ($key, $subhash) = each %$data) {
            my $sum = $dont_normalize ? 1 : sum(values %$subhash);
            next unless $sum;    # empty or all-zero table: avoid dividing by zero
            while (my ($c, $v) = each %$subhash) {
                $grams{$key}{$c} += $v / $sum;
            }
        }

        # Word-length histogram.
        my $sum = $dont_normalize ? 1 : sum(values %$fdata);
        next unless $sum;
        while (my ($len, $f) = each %$fdata) {
            $freqs{$len} += $f / $sum;
        }
    }
    die "No usable dataset data was loaded\n" unless %grams && %freqs;

    # Leading digits of the first argument give the word count; anything
    # else, including 0 or no argument, means one word.
    my $count = 1;
    if (defined $ARGV[0] && $ARGV[0] =~ /\A(\d+)/) {
        $count = int($1) || 1;
    }

    print join(' ', map { generate() } 1 .. $count), "\n";
    return;
}
# Run main() only when this file has no caller, i.e. when it is executed
# directly rather than loaded by other code.
unless (caller) {
    main();
}
Add Comment
Please, Sign In to add comment