Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl -w
use strict;
use warnings;
use Tie::RDBM;
use Digest::MD5 qw/md5_hex/;
use Data::Dumper;

sub processSymbol;
sub simpKey;

# Reads text from the files named on the command line (or STDIN), splits each
# line into statements on '.', and records every word-to-word transition in a
# database-backed hash via processSymbol().  Changes are committed once per
# input line.

# Sentinels. Use them as special values.  Real words are always stored with
# an "x" prefix (see the loop below), so they can never collide with these.
my $leader = "l";    # Prior to first word in a statement.
my $ender  = "e";    # After last word in a statement.

# The transition table, tied to the database.  autocommit is off, so nothing
# is persisted until commit() is called on the tied object.
my %symbols;
tie %symbols, 'Tie::RDBM', {
    db         => 'dbi:mysql:databasename',
    create     => 1,
    user       => 'username',
    password   => 'password',
    autocommit => 0,
} or die "Could not tie the symbol table to the database\n";

print "Tied. Found " . scalar( keys(%symbols) ) . " keys\n";

# Make sure the leader sentinel has a row.  The placeholder 0 is replaced by
# a real bucket the first time processSymbol() sees the leader.
$symbols{ simpKey($leader) } = 0
    unless exists $symbols{ simpKey($leader) };

while ( my $line = <> )
{
    chomp $line;

    for my $statement ( split /\./, $line )
    {
        # split ' ' skips leading whitespace and treats runs of whitespace
        # as one separator, so no empty "words" are produced (the text after
        # ". " would otherwise start with one).
        my @words = split ' ', $statement;

        # A statement without words (e.g. between two consecutive dots)
        # carries no transition worth recording.
        next unless @words;

        my $presentSymbol = $leader;
        for my $word (@words)
        {
            my $nextSymbol = "x$word";
            processSymbol( $presentSymbol, $nextSymbol );

            # Slip into the future.
            $presentSymbol = $nextSymbol;
        }

        # Terminate the last symbol.
        processSymbol( $presentSymbol, $ender );
    }

    # Commit our changes for this line.
    ( tied %symbols )->commit();
}
# simpKey($symbol)
#
# Reduce a symbol to a short, fixed-width database key: the first eight
# hexadecimal characters of its MD5 digest.
#
# Raw symbols are not used as keys because the original author found that
# Tie::RDBM choked on some of them (the exact cause was never identified).
#
# Eight hex characters give a 32-bit key space.  Note that two different
# symbols can share a key: by the birthday bound a collision becomes likely
# after some tens of thousands of distinct symbols, long before the roughly
# 4 billion possible keys are used up.  The width is kept as-is because
# changing it would orphan keys already stored.
#
# Returns the 8-character lowercase hex key.
sub simpKey
{
    my ($symbol) = @_;

    return substr( md5_hex($symbol), 0, 8 );
}
# processSymbol($presentSymbol, $nextSymbol)
#
# Record one observed transition from $presentSymbol to $nextSymbol.
#
# Each row of %symbols is keyed by simpKey($presentSymbol) and holds a
# "bucket": a hash ref mapping every successor symbol (stored verbatim, not
# hashed) to the number of times it has followed $presentSymbol.
#
# The bucket is read with a single fetch; a missing key is expected to come
# back as undef, which the ref() test below treats the same as the scalar
# placeholder 0 the script stores for the leader sentinel.  Anything that is
# not a hash ref is replaced by a fresh bucket rather than being
# dereferenced, which would die under strict refs.
sub processSymbol
{
    my ( $presentSymbol, $nextSymbol ) = @_;
    my $presentKey = simpKey($presentSymbol);

    my $stored        = $symbols{$presentKey};
    my $presentBucket = ref($stored) eq 'HASH' ? $stored : {};

    $presentBucket->{$nextSymbol} += 1;

    # Save back to the hash; the tied store only sees the change on
    # assignment, not when the fetched copy is modified in place.
    $symbols{$presentKey} = $presentBucket;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement