Advertisement
Guest User

spellclean.pl

a guest
Aug 18th, 2013
50
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 2.91 KB | None | 0 0
  #!/usr/bin/perl

  # spellclean.pl
  # Cleans a spell-checking word list (one word per line).
  # USAGE: spellclean.pl FILE.dic
  #
  # The script makes two passes over FILE.dic:
  #   1. Preview: every line that would be removed is printed and its line
  #      number is remembered.
  #   2. Write:   after the user confirms, every line that was NOT flagged is
  #      copied to FILE.dic.cleaned.  The original file is never modified.
  #
  # A line is removed when the word on it:
  #   - begins with a digit,
  #   - begins with an upper-case ASCII letter (acronyms and names),
  #   - contains a hyphen or a dot that is preceded by at least one character,
  #   - contains an apostrophe,
  #   - is four characters long or shorter (this includes empty lines).

  use strict;
  use warnings;

  # Words of this length or shorter are dropped from the list.
  my $MAX_REMOVED_LENGTH = 4;

  my $filename = shift;    # word list to clean, taken from the command line
  die "USAGE: spellclean.pl FILE.dic\n" unless defined $filename;

  my $filenamemod  = $filename . '.cleaned';    # destination file name
  my $linecount    = 0;     # line counter for the source file
  my @badlines     = ();    # line numbers to drop, in ascending order
  my $modlinecount = 0;     # line counter for the destination file

  # Returns true when the given line holds a word that must be removed.
  # The line terminator (LF or CRLF) is stripped first so that the length
  # test counts only the characters of the word itself; the original
  # patterns silently counted a trailing carriage return as a letter.
  sub is_unwanted_word {
      my ($raw_line) = @_;

      (my $word = $raw_line) =~ s/[\r\n]+\z//;

      return 1 if $word =~ /^\d/;       # begins with a number
      return 1 if $word =~ /^[A-Z]/;    # acronym or Name-style word
      return 1 if $word =~ /.-/;        # hyphen (a leading hyphen is not matched)
      return 1 if $word =~ /.\./;       # dot (a leading dot is not matched)
      return 1 if $word =~ /'/;         # apostrophe
      return 1 if length($word) <= $MAX_REMOVED_LENGTH;    # too short
      return 0;
  }

  # First pass: open the file for parsing and preview what will be removed.
  # Three-argument open keeps special characters in the file name from being
  # interpreted as an open mode.
  open my $spellfile, '<', $filename
      or die "error opening file $filename: $!\n";

  print "Here is the preview of the words removed, press [Enter] to continue: ";
  <STDIN>;    # wait for input

  while (my $line = <$spellfile>) {
      $linecount++;
      next unless is_unwanted_word($line);

      push @badlines, $linecount;
      print "Bad line $linecount -> $line";
  }
  close $spellfile;

  print "Preview complete, press [Enter] to remove words from list or ^C to abort! ";
  <STDIN>;    # wait for input

  # Second pass: reopen the original file and create the destination file.
  open $spellfile, '<', $filename
      or die "error opening file $filename: $!\n";
  open my $spellfilemod, '>', $filenamemod
      or die "error creating file $filenamemod: $!\n";

  $linecount = 0;      # restart counting for the second pass
  my $nextbad = 0;     # index into @badlines of the next line number to drop

  # @badlines is in ascending order, so only its current element has to be
  # compared with the current line number.  The bounds check stops the
  # comparison from reading past the end of the array once every flagged
  # line has been consumed.
  while (my $line = <$spellfile>) {
      $linecount++;
      if ($nextbad < @badlines && $badlines[$nextbad] == $linecount) {
          print "Removing line $linecount -> $line";
          $nextbad++;
      }
      else {
          print {$spellfilemod} $line;
          $modlinecount++;
      }
  }

  close $spellfile;
  # Buffered write errors only surface at close, so check it.
  close $spellfilemod
      or die "error closing file $filenamemod: $!\n";

  # summary
  print "\n\nCompleted!\n";
  print "Total lines in the original file is: $linecount\n";
  print "Total lines removed is: " . scalar(@badlines) . "\n";
  print "Cleaned file has: $modlinecount lines\n";
  print "Cleaned file is saved as: $filenamemod\n";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement