Advertisement
Guest User

spellclean.pl

a guest
Aug 18th, 2013
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/perl
  2.  
  3. #spellclean.pl
  4. #cleans the spell checking word list.
  5. #USAGE spellclean.pl FILE.dic
  6.  
  7.  
  8. my $filename=shift;  #shift command line argument to variable filename
  9. my $filenamemod = $filename.'.cleaned';  #cleaned filename and extension
  10. my $filenamebad = $filename.'.bad';  #bad lines filename and extension
  11. my $linecount = 0;  #whole file line counter
  12. my $removeline = 0;  #delete line flag
  13. my @badlines = ();  #delete line array
  14. my $modlinecount = 0;   #line counter for the destination file.
  15.  
  16. # open file for phrasing
  17. open SPELLFILE, $filename or die "error opening file $filename\n";
  18.  
  19.  
  20. print "Here is the preview of the words removed, press [Enter] to continue: ";
  21. <STDIN>;  #wait for input
  22.  
  23.  
  24. while ($line=<SPELLFILE>) {
  25.     $linecount++;  #increment linecount
  26.  
  27.     if ($line=~m/^\d.*/) {  #remove any words beginning with numbers
  28.         $removeline = 1;
  29.     }
  30.     elsif ($line=~m/(^[A-Z]{2}?)/) {    #remove any CAPital words, like acronyms
  31.         $removeline = 1;
  32.     }
  33.     elsif ($line=~m/.-/gm) {    #remove any words with hyphens
  34.         $removeline = 1;
  35.     }
  36.     elsif ($line=~ /.\./gm) {   #remove any words with dots
  37.         $removeline = 1;
  38.     }
  39.     elsif ($line=~ /^.{1,3}$/gm) {  #remove any one and two letter words
  40.         $removeline = 1;
  41.     }
  42.     elsif ($line=~ /^.{4,4}$/gm) {  #remove any three letter words
  43.         $removeline = 1;
  44.     }
  45.     elsif ($line=~ /^.{5,5}$/gm) {  #remove any four letter words
  46.         $removeline = 1;
  47.     }
  48.     elsif ($line=~ /^[A-Z].*$/gm) { #remove any Name style words
  49.         $removeline = 1;
  50.     }
  51.     elsif ($line=~ /'.*$/gm) {  #remove any words with apostrophy
  52.         $removeline = 1;
  53.     }
  54.    
  55.     # push lines number to the array, reset the removeline flag to 0 before looping back
  56.     if ($removeline == 1) {
  57.         push (@badlines, $linecount);
  58.         print "Bad line $linecount -> $line";
  59.         $removeline = 0;
  60.     }
  61.  
  62. }
  63. close (SPELLFILE); # close the file
  64.  
  65.  
  66. print "Preview complete, press [Enter] to remove words from list or ^C to abort! ";
  67. <STDIN>;   #wait for input
  68.  
  69.  
  70. #open the original file and a destination file.
  71. open SPELLFILE, $filename or die "error opening file $filename\n";
  72. open SPELLFILEMOD, ">$filenamemod" or die "error creating file $filenamemod\n";
  73. open SPELLFILEBAD, ">$filenamebad" or die "error creating file $filenamebad\n";
  74.  
  75. $linecount = 0;  #reset line counter to zero.
  76. $removeline = 0; #reuse this variable.
  77.  
  78.  
  79. #checks if the line number of the original file matches the first element of the array [0], if it does then it is a badline
  80. #and it wont be printed to the destination file.  The removeline variable is incremented for the next array element.
  81.  
  82. while ($line=<SPELLFILE>) {
  83.     $linecount++;
  84.     if ($badlines[$removeline]==$linecount) {
  85.         print "Removing line $linecount -> $line";
  86.         print SPELLFILEBAD "$line";
  87.         $removeline++;
  88.     }
  89.     else {
  90.          print SPELLFILEMOD "$line";
  91.          $modlinecount++;
  92.      }
  93. }
  94. #close the files
  95. close SPELLFILE;
  96. close SPELLFILEMOD;
  97. close SPELLFILEBAD;
  98.  
  99. # summary
  100. print "\n\nCompleted!\n";
  101. print "Total lines in the original file is: $linecount\n";
  102. print "Total lines removed is: ".scalar(@badlines)."\n";
  103. print "Cleaned file has: $modlinecount lines\n";
  104. print "Cleaned file is saved as: $filenamemod\n";
  105. print "Bad lines file is saved as: $filenamebad\n";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement