SHOW:
|
|
- or go back to the newest paste.
#!/usr/bin/perl

# spellclean.pl
# Cleans a spell-checking word list (one word per line).
# USAGE: spellclean.pl FILE.dic
#
# The script makes two passes over FILE.dic:
#   1. Preview: every line that would be removed is printed.
#   2. Split:   kept lines go to FILE.dic.cleaned, removed lines go to
#               FILE.dic.bad.
# The user must press [Enter] before each pass, so the run can be aborted
# with ^C after the preview without any file having been written.

use strict;
use warnings;

my $filename = shift;    # word list to clean, taken from the command line
die "USAGE: $0 FILE.dic\n" unless defined $filename && length $filename;

my $filenamemod = $filename . '.cleaned';    # destination for kept lines
my $filenamebad = $filename . '.bad';        # destination for removed lines

# Words shorter than this many characters are removed (i.e. words of one to
# four letters, as well as blank lines).
my $MIN_WORD_LENGTH = 5;

# The prompts below have no trailing newline; unbuffer STDOUT so they are
# shown before the script blocks reading STDIN.
$| = 1;

# is_bad_line($line)
# Returns 1 when the line should be removed from the word list, 0 otherwise.
# The line ending (LF or CRLF) is stripped before testing, so the length
# rule counts only the characters of the word and behaves the same for
# Unix and DOS files, and for a final line that has no newline at all.
sub is_bad_line {
    my ($line) = @_;

    ( my $word = $line ) =~ s/\r?\n\z//;

    return 1 if length($word) < $MIN_WORD_LENGTH;    # too short (or blank)
    return 1 if $word =~ /^\d/;        # begins with a digit
    return 1 if $word =~ /^[A-Z]/;     # acronyms and Name-style words
    return 1 if $word =~ /.[-.]/;      # hyphen or dot after the first char
    return 1 if $word =~ /'/;          # contains an apostrophe
    return 0;
}

# ---- pass 1: preview the lines that will be removed ----------------------

open my $in, '<', $filename or die "error opening file $filename: $!\n";

print "Here is the preview of the words removed, press [Enter] to continue: ";
<STDIN>;    # wait for input

my $linecount = 0;    # line counter for the source file
while ( my $line = <$in> ) {
    $linecount++;
    print "Bad line $linecount -> $line" if is_bad_line($line);
}
close $in;

print "Preview complete, press [Enter] to remove words from list or ^C to abort! ";
<STDIN>;    # wait for input

# ---- pass 2: split the source into the cleaned and bad files -------------

open $in, '<', $filename or die "error opening file $filename: $!\n";
open my $out_clean, '>', $filenamemod
    or die "error creating file $filenamemod: $!\n";
open my $out_bad, '>', $filenamebad
    or die "error creating file $filenamebad: $!\n";

$linecount = 0;          # restart the count for the second pass
my $removedcount = 0;    # lines written to the bad file
my $modlinecount = 0;    # lines written to the cleaned file

while ( my $line = <$in> ) {
    $linecount++;
    if ( is_bad_line($line) ) {
        print "Removing line $linecount -> $line";
        print {$out_bad} $line;
        $removedcount++;
    }
    else {
        print {$out_clean} $line;
        $modlinecount++;
    }
}

close $in;
# Buffered write errors only surface at close, so check the output handles.
close $out_clean or die "error writing file $filenamemod: $!\n";
close $out_bad   or die "error writing file $filenamebad: $!\n";

# ---- summary --------------------------------------------------------------

print "Total lines in the original file is: $linecount\n";
print "Total lines removed is: $removedcount\n";
print "Cleaned file has: $modlinecount lines\n";
print "Cleaned file is saved as: $filenamemod\n";
print "Bad lines file is saved as: $filenamebad\n";