Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl -w
use Tie::RDBM;
use Digest::MD5 qw/md5_hex/;

# Forward declarations so the subs can be called before their definitions.
sub processSymbol;
sub simpKey;

# Sentinels. Use them as special values.
# Real words are stored with an "x" prefix by the main loop, so these
# single-letter sentinels cannot collide with a word from the input.
my $leader = "l"; # Prior to first word in a statement.
my $ender = "e"; # After last word in a statement.

# Symbol table, tied to a database table through Tie::RDBM.
# Keys are shortened digests of a symbol (see simpKey); values are hash
# references mapping each following symbol to the number of times it was seen.
# autocommit is off: the main loop commits once per input line.
my %symbols;
tie(%symbols, 'Tie::RDBM', {
    db         => 'dbi:mysql:databasename',
    create     => 1,
    user       => 'username',
    password   => 'password',
    autocommit => 0,
}) or die "Cannot tie the symbol table to the database\n";

# Make sure the leader sentinel has an entry. It is seeded with an empty
# bucket (a hash reference) because processSymbol dereferences every stored
# value as a hash; a plain scalar here would be dereferenced symbolically.
$symbols{simpKey($leader)} = {}
    unless exists $symbols{simpKey($leader)};
# Main loop: read the input line by line, break each line into statements
# (separated by "."), and record every word-to-word transition, bracketed by
# the leader and ender sentinels.
while (my $line = <>)
{
    chomp $line;

    foreach my $statement (split /\./, $line)
    {
        # split ' ' skips leading whitespace and collapses runs of it, so
        # the space after a "." or a double space never yields an empty word.
        my @words = split ' ', $statement;

        # A statement without words (e.g. between two consecutive dots)
        # carries no transition worth recording.
        next unless @words;

        my $presentSymbol = $leader;
        foreach my $word (@words)
        {
            # Prefix real words so they can never equal a sentinel.
            my $nextSymbol = "x$word";
            processSymbol($presentSymbol, $nextSymbol);

            # Slip into the future.
            $presentSymbol = $nextSymbol;
        }

        # Terminate the last symbol.
        processSymbol($presentSymbol, $ender);
    }

    # Commit our changes for this line
    (tied %symbols)->commit();
}
# simpKey(SYMBOL)
# Turn a symbol into the short key used for the tied symbol table.
# The raw symbols upset Tie::RDBM somewhere, so they are hashed first, and
# the MD5 hex digest is then cut down to its first eight characters.
# Eight hex characters are 32 bits, i.e. roughly 4 billion distinct keys.
# Returns the eight-character lowercase hex string.
sub simpKey
{
    my ($symbol) = @_;
    return substr(md5_hex($symbol), 0, 8);
}
# processSymbol(PRESENT, NEXT)
# Record one observed transition from symbol PRESENT to symbol NEXT.
#   PRESENT - the current symbol; it is reduced with simpKey to form the key
#             into %symbols.
#   NEXT    - the symbol that followed it; used verbatim as the key inside
#             PRESENT's bucket.
# Returns the updated count for the PRESENT -> NEXT link.
sub processSymbol
{
    my ($presentSymbol, $nextSymbol) = @_;
    my $presentKey = simpKey($presentSymbol);

    # Fetch the bucket once. A missing entry, or one that does not hold a
    # hash reference (such as a plain 0 seed), is treated as an empty bucket
    # instead of being dereferenced symbolically.
    my $bucket = $symbols{$presentKey};
    $bucket = {} unless ref($bucket) eq 'HASH';

    # Increment the count for the next symbol (a missing link starts at 0).
    ++$bucket->{$nextSymbol};

    # Store the bucket back explicitly. %symbols is tied, and a tied fetch
    # of a reference value presumably hands back a deserialized copy
    # (confirm against the Tie::RDBM documentation), so editing the fetched
    # structure in place would not reach the database.
    $symbols{$presentKey} = $bucket;

    return $bucket->{$nextSymbol};
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement