Advertisement
Guest User

Untitled

a guest
May 20th, 2017
109
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 4.11 KB | None | 0 0
  1. #!/usr/bin/perl -w
  2.  
  3. use strict;
  4.  
  5. use Tie::RDBM;
  6.  
  7. sub processSymbol;
  8. sub simpKey;
  9. sub commitTie;
  10.  
  11. # Sentinels. Use them as special values.
  12. my $leader = "b"; # Prior to first word in a statement.
  13. my $ender = "c"; # After last word in a statement.
  14.  
  15.  
  16. my %symbols;
  17. tie %symbols , 'Tie::RDBM', {db => 'dbi:mysql:databasename',
  18.                 create=> 0,
  19.                 user=> 'databaseuser',
  20.                 password => 'password',
  21.                 autocommit => 0 };
  22.  
  23. print "Tied. Found " . scalar( keys( %symbols)) . " symbols\n";
  24.  
  25. $symbols{&simpKey($leader)} = 0
  26.     unless exists $symbols{&simpKey($leader)};
  27.  
  28.  
  29. my $commitPulse = 10000; # Commit every N symbols.
  30.  
  31. my $linesCounted = 0;
  32. my $statementsCounted = 0;
  33. my $symbolsProcessed = 0;
  34. my $symbolsIntroduced = 0;
  35.  
  36. while(<>)
  37. {
  38.     chomp;
  39.     my @statements = split /\.\?\!/;
  40.  
  41.     foreach( @statements )
  42.     {
  43.         # Get our symbols, filter out empty statements.
  44.         my @statementSymbols = grep { 0 != length } split /\s/;
  45.  
  46.         my $presentSymbol = $leader;
  47.  
  48.         my $nextSymbol;
  49.  
  50.         foreach(@statementSymbols)
  51.         {
  52.             $nextSymbol = "t$_";
  53.  
  54.             &processSymbol($presentSymbol, $nextSymbol);
  55.             # Slip into the future.
  56.             $presentSymbol = $nextSymbol;
  57.         }
  58.  
  59.         # Terminate the last symbol.
  60.         $nextSymbol = $ender;
  61.  
  62.         &processSymbol($presentSymbol, $nextSymbol);
  63.         ++$statementsCounted;
  64.     }
  65.  
  66.     ++$linesCounted;
  67.  
  68. }
  69.  
  70. &commitTie();
  71.  
  72. print "Ended with " . scalar( keys( %symbols)) . " symbols\n";
  73.  
  74. sub commitTie
  75. {
  76.     # Commit our changes
  77.     (tied %symbols)->commit();
  78.     print "Committed. $linesCounted lines counted, $statementsCounted statements counted, $symbolsProcessed symbols processed, $symbolsIntroduced introduced\n";
  79.     $symbolsIntroduced = 0;
  80.     $symbolsProcessed = 0;
  81. }
  82.  
  83. sub phoneticShift
  84. {
  85.     # This is a multi-pass function. I suggest you keep calling it until you get back what you provided.
  86.     my $key = shift;
  87.  
  88.  
  89.     # Do some phonetic simplification.
  90.     # Any consonant that involves the vocal chords, remove the vocal.
  91.     # Also follow pun-downs, where possible.
  92.     # Admittedly, this is mostly going to derive from my own accent...
  93.     # b->p
  94.     # d->t
  95.     # v->f
  96.     # s->c
  97.     # z->s
  98.  
  99.     $key =~ tr/bdvsz/ptfcs/;
  100.  
  101.     # que->k ( cheque -> chek, plastique -> plastik )
  102.     $key =~ s/que/k/g;
  103.  
  104.     # qu->kw (quick -> kwick)
  105.     $key =~ s/qu/kw/g;
  106.  
  107.     # q->k
  108.     $key =~ s/q/k/g;
  109.  
  110.     # x->cks (linucks, for example.)
  111.     $key =~ s/x/cks/g;
  112.  
  113.     # g->ch (geepers -> cheepers) (Although I could go with g->k, for gargoyle -> karkoyle)
  114.     # j->ch (jaay -> chaaa) (solves g/j punning, too)
  115.     # k->ch (konk -> conch) (solves the geepers/gargoyle g dual-usage as well)
  116.     $key =~ s/[sjk]/ch/g;
  117.  
  118.     # Drop all the vowels. There's a lot of redundancy there, and we might even get a pun or two out of it.
  119.     $key =~ s/[aeiou]//g;
  120.  
  121.     return $key;
  122. }
  123.  
  124. sub simpKey
  125. {
  126.     my $key = shift;
  127.     my $origin = $key;
  128.  
  129.     # lower-case it. We don't honestly give rat's behind about upper-case vs lower-case, when looking things up.
  130.     $key = lc $key;
  131.  
  132.     # Drop anything that's not alphabetic
  133.     $key =~ s/[^a-z]//g;
  134.  
  135.     my $tmpKey = &phoneticShift($key);
  136.    
  137.     while($tmpKey ne $key)
  138.     {
  139.         $key = $tmpKey;
  140.         $tmpKey = &phoneticShift($key);
  141.     }
  142.  
  143.     # Take our final shifted form and treat that as our key.
  144.     $key = $tmpKey;
  145.  
  146.  
  147.     # Now truncate any symbol longer than 15 chars. (I don't think there's an English word that has that many consonants, even *after* our exanding things to cks and ch.)
  148.     $key = substr $key, 0, 15;
  149.  
  150.     return $key;
  151. }
  152.  
  153. sub processSymbol
  154. {
  155.     my $presentSymbol = &simpKey(shift);
  156.     my $nextSymbol = shift;
  157.  
  158.     # Need to do some sanitization of nextSymbol, for some sad reason. Tie::RDBM screws up, somehow.
  159.  
  160.     # Make sure the present symbol is known.
  161.     my $presentBucket = {$nextSymbol => 0};
  162.  
  163.     # 1-2 fetches
  164.     if(exists $symbols{$presentSymbol})
  165.     {
  166.         my $foundBucket = $symbols{$presentSymbol};
  167.         if ($foundBucket ne "0")
  168.         {
  169.             $presentBucket = $foundBucket;
  170.         }
  171.         else
  172.         {
  173.             ++$symbolsIntroduced;
  174.         }
  175.     }
  176.     else
  177.     {
  178.         ++$symbolsIntroduced;
  179.     }
  180.  
  181.     $presentBucket->{$nextSymbol} += 1;
  182.  
  183.     # Save back to the hash.
  184.     $symbols{$presentSymbol} = $presentBucket;
  185.  
  186.     ++$symbolsProcessed;
  187.  
  188.     if(0 == ( $symbolsProcessed % $commitPulse) )
  189.     {
  190.         &commitTie();
  191.     }
  192. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement