SHOW:
|
|
- or go back to the newest paste.
#!/usr/bin/perl

# spellclean.pl
# Cleans the spell checking word list.
# USAGE: spellclean.pl FILE.dic
#
# Pass 1 previews every line that will be dropped. After the user confirms,
# pass 2 copies the kept lines to FILE.dic.cleaned and the dropped lines to
# FILE.dic.bad. The input file itself is only ever opened for reading.

use strict;
use warnings;

# Words with this many characters or fewer are dropped.
my $MAX_SHORT_WORD = 4;

# is_bad_word($line)
#
# Takes one raw line of the word list (with or without its line ending) and
# returns 1 when the line should be removed, 0 when it should be kept.
sub is_bad_word {
    my ($word) = @_;

    # Strip LF or CRLF first. The length rule below must count only the
    # characters of the word; otherwise a trailing "\r" shifts every length
    # threshold by one depending on the file's line-ending style.
    $word =~ s/\r?\n\z//;

    return 1 if $word =~ /^\d/;       # begins with a digit
    return 1 if $word =~ /^[A-Z]/;    # begins with a capital: acronyms and Name style words
    return 1 if $word =~ /.-/;        # hyphen anywhere after the first character
    return 1 if $word =~ /.\./;       # dot anywhere after the first character
    return 1 if $word =~ /'/;         # apostrophe anywhere

    # One to four letter words. Empty lines are not words and are kept.
    my $letters = length $word;
    return 1 if $letters >= 1 && $letters <= $MAX_SHORT_WORD;

    return 0;
}

my $filename = shift;    # word list to clean, from the command line
die "USAGE: spellclean.pl FILE.dic\n"
    unless defined $filename && length $filename;

my $filenamemod  = $filename . '.cleaned';    # destination for kept lines
my $filenamebad  = $filename . '.bad';        # destination for dropped lines
my $linecount    = 0;                         # line counter for the file being read
my @badlines     = ();                        # line numbers to drop, in ascending order
my $modlinecount = 0;                         # lines written to the cleaned file

# ---- Pass 1: preview ------------------------------------------------------

# Three-argument open so the file name can never be interpreted as a mode.
open my $spellfile, '<', $filename
    or die "error opening file $filename: $!\n";

print "Here is the preview of the words removed, press [Enter] to continue: ";
<STDIN>;    # wait for input

while (my $line = <$spellfile>) {
    $linecount++;
    next unless is_bad_word($line);

    push @badlines, $linecount;
    print "Bad line $linecount -> $line";
}
close $spellfile;

print "Preview complete, press [Enter] to remove words from list or ^C to abort! ";
<STDIN>;    # wait for input

# ---- Pass 2: write the cleaned and bad files ------------------------------

open $spellfile, '<', $filename
    or die "error opening file $filename: $!\n";
open my $spellfilemod, '>', $filenamemod
    or die "error creating file $filenamemod: $!\n";
open my $spellfilebad, '>', $filenamebad
    or die "error creating file $filenamebad: $!\n";

$linecount = 0;     # restart numbering for the second read
my $nextbad = 0;    # index into @badlines of the next line number to drop

# @badlines is in ascending order, so only its next pending entry can match
# the current line. The bounds check avoids comparing against undef once
# every bad line has been consumed.
while (my $line = <$spellfile>) {
    $linecount++;
    if ($nextbad < @badlines && $badlines[$nextbad] == $linecount) {
        print "Removing line $linecount -> $line";
        print {$spellfilebad} $line;
        $nextbad++;
    }
    else {
        print {$spellfilemod} $line;
        $modlinecount++;
    }
}

close $spellfile;
# Buffered write errors (e.g. disk full) only surface at close, so check them.
close $spellfilemod or die "error closing file $filenamemod: $!\n";
close $spellfilebad or die "error closing file $filenamebad: $!\n";

# ---- Summary --------------------------------------------------------------

print "Total lines in the original file is: $linecount\n";
print "Total lines removed is: " . scalar(@badlines) . "\n";
print "Cleaned file has: $modlinecount lines\n";
print "Cleaned file is saved as: $filenamemod\n";
print "Bad lines file is saved as: $filenamebad\n";