Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl -w
- use strict;
- use Tie::RDBM;
- sub processSymbol;
- sub simpKey;
- sub commitTie;
- # Sentinels. Use them as special values.
- my $leader = "b"; # Prior to first word in a statement.
- my $ender = "c"; # After last word in a statement.
- my %symbols;
- tie %symbols , 'Tie::RDBM', {db => 'dbi:mysql:databasename',
- create=> 0,
- user=> 'databaseuser',
- password => 'password',
- autocommit => 0 };
- print "Tied. Found " . scalar( keys( %symbols)) . " symbols\n";
- $symbols{&simpKey($leader)} = 0
- unless exists $symbols{&simpKey($leader)};
- my $commitPulse = 10000; # Commit every N symbols.
- my $linesCounted = 0;
- my $statementsCounted = 0;
- my $symbolsProcessed = 0;
- my $symbolsIntroduced = 0;
- while(<>)
- {
- chomp;
- my @statements = split /\.\?\!/;
- foreach( @statements )
- {
- # Get our symbols, filter out empty statements.
- my @statementSymbols = grep { 0 != length } split /\s/;
- my $presentSymbol = $leader;
- my $nextSymbol;
- foreach(@statementSymbols)
- {
- $nextSymbol = "t$_";
- &processSymbol($presentSymbol, $nextSymbol);
- # Slip into the future.
- $presentSymbol = $nextSymbol;
- }
- # Terminate the last symbol.
- $nextSymbol = $ender;
- &processSymbol($presentSymbol, $nextSymbol);
- ++$statementsCounted;
- }
- ++$linesCounted;
- }
- &commitTie();
- print "Ended with " . scalar( keys( %symbols)) . " symbols\n";
- sub commitTie
- {
- # Commit our changes
- (tied %symbols)->commit();
- print "Committed. $linesCounted lines counted, $statementsCounted statements counted, $symbolsProcessed symbols processed, $symbolsIntroduced introduced\n";
- $symbolsIntroduced = 0;
- $symbolsProcessed = 0;
- }
- sub phoneticShift
- {
- # This is a multi-pass function. I suggest you keep calling it until you get back what you provided.
- my $key = shift;
- # Do some phonetic simplification.
- # Any consonant that involves the vocal chords, remove the vocal.
- # Also follow pun-downs, where possible.
- # Admittedly, this is mostly going to derive from my own accent...
- # b->p
- # d->t
- # v->f
- # s->c
- # z->s
- $key =~ tr/bdvsz/ptfcs/;
- # que->k ( cheque -> chek, plastique -> plastik )
- $key =~ s/que/k/g;
- # qu->kw (quick -> kwick)
- $key =~ s/qu/kw/g;
- # q->k
- $key =~ s/q/k/g;
- # x->cks (linucks, for example.)
- $key =~ s/x/cks/g;
- # g->ch (geepers -> cheepers) (Although I could go with g->k, for gargoyle -> karkoyle)
- # j->ch (jaay -> chaaa) (solves g/j punning, too)
- # k->ch (konk -> conch) (solves the geepers/gargoyle g dual-usage as well)
- $key =~ s/[sjk]/ch/g;
- # Drop all the vowels. There's a lot of redundancy there, and we might even get a pun or two out of it.
- $key =~ s/[aeiou]//g;
- return $key;
- }
- sub simpKey
- {
- my $key = shift;
- my $origin = $key;
- # lower-case it. We don't honestly give rat's behind about upper-case vs lower-case, when looking things up.
- $key = lc $key;
- # Drop anything that's not alphabetic
- $key =~ s/[^a-z]//g;
- my $tmpKey = &phoneticShift($key);
- while($tmpKey ne $key)
- {
- $key = $tmpKey;
- $tmpKey = &phoneticShift($key);
- }
- # Take our final shifted form and treat that as our key.
- $key = $tmpKey;
- # Now truncate any symbol longer than 15 chars. (I don't think there's an English word that has that many consonants, even *after* our exanding things to cks and ch.)
- $key = substr $key, 0, 15;
- return $key;
- }
- sub processSymbol
- {
- my $presentSymbol = &simpKey(shift);
- my $nextSymbol = shift;
- # Need to do some sanitization of nextSymbol, for some sad reason. Tie::RDBM screws up, somehow.
- # Make sure the present symbol is known.
- my $presentBucket = {$nextSymbol => 0};
- # 1-2 fetches
- if(exists $symbols{$presentSymbol})
- {
- my $foundBucket = $symbols{$presentSymbol};
- if ($foundBucket ne "0")
- {
- $presentBucket = $foundBucket;
- }
- else
- {
- ++$symbolsIntroduced;
- }
- }
- else
- {
- ++$symbolsIntroduced;
- }
- $presentBucket->{$nextSymbol} += 1;
- # Save back to the hash.
- $symbols{$presentSymbol} = $presentBucket;
- ++$symbolsProcessed;
- if(0 == ( $symbolsProcessed % $commitPulse) )
- {
- &commitTie();
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement