Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl
# spellclean.pl
# Cleans a spell-checking word list (one word per line).
#
# USAGE: spellclean.pl FILE.dic
#
# Pass 1 previews every line that would be removed.  After the user confirms,
# pass 2 copies the kept lines to FILE.dic.cleaned and the removed lines to
# FILE.dic.bad.  The input file itself is only ever opened for reading.
use strict;
use warnings;

# Longest word, in characters and without its line ending, that is still
# removed as "too short" (one- to four-letter words).
use constant MAX_SHORT_WORD_LENGTH => 4;

# is_bad_line($line)
#   $line   - one raw line read from the word list, line ending included.
#   Returns 1 when the line must be removed from the list, 0 otherwise.
sub is_bad_line {
    my ($line) = @_;

    # Judge the word without its line ending, so LF and CRLF word lists are
    # classified identically (a trailing "\r" must not count as a letter).
    (my $word = $line) =~ s/[\r\n]+\z//;

    return 1 if $word =~ /\A\d/;        # begins with a digit
    return 1 if $word =~ /\A[A-Z]/;     # acronyms and Name-style words
    return 1 if $word =~ /.-/;          # hyphen preceded by at least one character
    return 1 if $word =~ /.\./;         # dot preceded by at least one character
    return 1 if length($word) <= MAX_SHORT_WORD_LENGTH;    # too short (or empty)
    return 1 if $word =~ /'/;           # contains an apostrophe
    return 0;
}

my $filename = shift;    # word list named on the command line
die "USAGE: spellclean.pl FILE.dic\n"
    unless defined($filename) && length($filename);

my $filenamemod = $filename . '.cleaned';    # destination for kept lines
my $filenamebad = $filename . '.bad';        # destination for removed lines
my @badlines;                                # line numbers to remove, ascending
my $linecount = 0;                           # line counter for the input file

# Pass 1: preview.  Record the number of every line that will be removed.
# Three-argument open keeps characters in the file name from being read as
# an open mode.
open my $preview, '<', $filename
    or die "error opening file $filename: $!\n";
print "Here is the preview of the words removed, press [Enter] to continue: ";
<STDIN>;    # wait for input
while (my $line = <$preview>) {
    $linecount++;
    next unless is_bad_line($line);
    push @badlines, $linecount;
    print "Bad line $linecount -> $line";
}
close $preview;

print "Preview complete, press [Enter] to remove words from list or ^C to abort! ";
<STDIN>;    # wait for input

# Pass 2: split the input into the cleaned file and the bad-lines file.
open my $source, '<', $filename
    or die "error opening file $filename: $!\n";
open my $cleaned, '>', $filenamemod
    or die "error creating file $filenamemod: $!\n";
open my $rejected, '>', $filenamebad
    or die "error creating file $filenamebad: $!\n";

$linecount = 0;
my $modlinecount = 0;    # lines written to the cleaned file
my $next_bad     = 0;    # index into @badlines of the next line number to drop

# @badlines is in ascending order, so only its next pending entry has to be
# compared with the current line number.  The bounds check avoids comparing
# against undef once every bad line has been consumed.
while (my $line = <$source>) {
    $linecount++;
    if ($next_bad < @badlines && $badlines[$next_bad] == $linecount) {
        print "Removing line $linecount -> $line";
        print {$rejected} $line;
        $next_bad++;
    }
    else {
        print {$cleaned} $line;
        $modlinecount++;
    }
}

close $source;
# Buffered write errors only surface at close, so check the output handles.
close $cleaned  or die "error writing file $filenamemod: $!\n";
close $rejected or die "error writing file $filenamebad: $!\n";

# summary
print "\n\nCompleted!\n";
print "Total lines in the original file is: $linecount\n";
print "Total lines removed is: ".scalar(@badlines)."\n";
print "Cleaned file has: $modlinecount lines\n";
print "Cleaned file is saved as: $filenamemod\n";
print "Bad lines file is saved as: $filenamebad\n";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement