View difference between Paste ID: PAa9m0Ud and 5pzE2bXy
SHOW: | | - or go back to the newest paste.
1
#!/usr/bin/perl

# spellclean.pl
# Cleans a spell-checking word list.  The script first previews the entries
# it is going to drop, waits for confirmation, and then writes the kept
# entries to FILE.cleaned and the dropped entries to FILE.bad.  The input
# file is only ever opened for reading.
# USAGE: spellclean.pl FILE.dic

use strict;
use warnings;

# is_bad_word($line)
#   $line - one raw line of the word list, trailing newline included.
# Returns 1 when the entry must be removed from the list, 0 otherwise.
# An entry is removed when it
#   * begins with a digit,
#   * begins with a capital A-Z (covers both acronyms and Name-style words),
#   * has a hyphen or a dot anywhere after its first character,
#   * contains an apostrophe, or
#   * is 1 to 5 characters long (the trailing newline is not counted).
# NOTE(review): the original comments described the length rule as removing
# one- to four-letter words, while the patterns matched 1-5 characters.  That
# is consistent only if every line carries one extra character, e.g. the "\r"
# of a CRLF line ending -- presumably the case for the original input; confirm
# against the real word lists before tightening the limit.
sub is_bad_word {
    my ($line) = @_;

    return 1 if $line =~ /^\d/;         # begins with a number
    return 1 if $line =~ /^[A-Z]/;      # acronym or Name-style word
    return 1 if $line =~ /.-/;          # hyphenated
    return 1 if $line =~ /.\./;         # dotted
    return 1 if $line =~ /'/;           # apostrophe
    return 1 if $line =~ /^.{1,5}$/;    # too short (see NOTE above)
    return 0;
}

# main($filename)
#   $filename - path of the word list to clean (first command line argument).
# Pass 1 reads the file and prints every line that is going to be removed.
# Pass 2, started after the user presses [Enter], copies each line either to
# "$filename.cleaned" or to "$filename.bad" and prints a summary.
# Dies with a usage message when no file name is given, and with the system
# error text when a file cannot be opened or written.
sub main {
    my ($filename) = @_;

    die "USAGE: spellclean.pl FILE.dic\n"
        unless defined $filename && length $filename;

    my $filenamemod = $filename . '.cleaned';    # kept entries go here
    my $filenamebad = $filename . '.bad';        # removed entries go here

    # Three-argument open: the file name is never interpreted as a mode or
    # a command, whatever characters it contains.
    open my $preview, '<', $filename
        or die "error opening file $filename: $!\n";

    print "Here is the preview of the words removed, press [Enter] to continue: ";
    <STDIN>;    # wait for input

    # Pass 1: remember the line number of every entry to remove.
    my %is_bad;
    my $linecount = 0;
    while ( my $line = <$preview> ) {
        $linecount++;
        next unless is_bad_word($line);
        $is_bad{$linecount} = 1;
        print "Bad line $linecount -> $line";
    }
    close $preview;

    print "Preview complete, press [Enter] to remove words from list or ^C to abort! ";
    <STDIN>;    # wait for input

    # Pass 2: split the original file into the two destination files.
    open my $source, '<', $filename
        or die "error opening file $filename: $!\n";
    open my $cleaned, '>', $filenamemod
        or die "error creating file $filenamemod: $!\n";
    open my $rejected, '>', $filenamebad
        or die "error creating file $filenamebad: $!\n";

    $linecount = 0;
    my $modlinecount = 0;    # number of lines written to the cleaned file
    while ( my $line = <$source> ) {
        $linecount++;
        if ( $is_bad{$linecount} ) {
            print "Removing line $linecount -> $line";
            print {$rejected} $line;
        }
        else {
            print {$cleaned} $line;
            $modlinecount++;
        }
    }

    close $source;
    # Buffered write errors only surface when the handle is closed.
    close $cleaned
        or die "error writing file $filenamemod: $!\n";
    close $rejected
        or die "error writing file $filenamebad: $!\n";

    # summary
    print "Total lines in the original file is: $linecount\n";
    print "Total lines removed is: " . scalar( keys %is_bad ) . "\n";
    print "Cleaned file has: $modlinecount lines\n";
    print "Cleaned file is saved as: $filenamemod\n";
    print "Bad lines file is saved as: $filenamebad\n";

    return;
}

# Start the interactive clean-up only when run as a script, so the helper
# above can be loaded on its own (e.g. with require) without prompting.
main(@ARGV) unless caller;

1;