Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl
# Computes TF-IDF scores for every regular file in an input directory and
# writes one report per input file (same file name) into an output directory.
#
# Usage: script [INPUT_DIR [OUTPUT_DIR]]
# INPUT_DIR defaults to /tmp/data and OUTPUT_DIR to /tmp/data_out.
#
# Each report line has the form
#   "<tf> * <idf> (<df>) = <tf*idf>\t<word>"
# where tf is the number of occurrences of the word in that file, df is the
# number of files containing the word, and idf is log(number_of_files / df).
# Lines are ordered by descending tf*idf.
use strict;
use warnings;
use utf8;
use Encode;

# Turns decoded document text into a list of lower-cased word tokens.
#
# Parameter: $text - character string (already decoded from UTF-8).
# Returns:   list of tokens in document order; may be empty.
sub tokenize {
    my ($text) = @_;

    $text = lc $text;

    # Markup clean-up. The order matters: whole <a ...>text</a> elements go
    # first (link text included), <br> becomes a sentence break, and only
    # then are the remaining tags blanked out.
    $text =~ s/<a [^>]+>[^<]+<[^>]+>/ /g;
    $text =~ s/<br>/. /g;
    $text =~ s/<[^>]+>/ /g;

    # ">>123"-style numeric references.
    $text =~ s/>>\d+/ /g;

    # Character entities such as "&#1a;" or "&nbsp;".
    $text =~ s/(&#[a-f0-9]+;|&[^;]{1,6};)/ /g;

    # Only this explicit set of punctuation is stripped; any other
    # punctuation character (for example '-') stays part of its token.
    $text =~ s/[,.!?_&#'"()\[\]=:;]/ /g;

    # Split on whitespace first and filter whole tokens afterwards. Filtering
    # with space-delimited substitutions on the full text misses tokens that
    # sit next to a line break or directly follow another removed token,
    # because each match consumes the delimiter the next one needs.
    return grep {
               !/\A\d+d\d+\z/                          # digits, 'd', digits (e.g. 2d6)
            && $_ ne 'd'                               # a lone "d"
            && !/\A[[:punct:]]*[0-9]+[[:punct:]]*\z/   # numbers, optionally wrapped in punctuation
    } split ' ', $text;
}

# Reads a whole file and decodes it as UTF-8.
#
# Parameter: $path - file to read.
# Returns:   the decoded text, or nothing (after a warning) when the file
#            cannot be opened.
sub read_document {
    my ($path) = @_;

    my $fh;
    unless (open($fh, '<', $path)) {
        warn "Skipping $path: $!\n";
        return;
    }
    my $bytes = do { local $/; <$fh> };
    close($fh);

    $bytes = '' unless defined $bytes;
    return decode_utf8($bytes);
}

# Writes the TF-IDF report for one document.
#
# Parameters:
#   $path     - output file; created or truncated, written as UTF-8.
#   $tf       - hash ref: word => occurrences in this document.
#   $idf      - hash ref: word => inverse document frequency.
#   $doc_freq - hash ref: word => number of documents containing the word.
# Dies when the file cannot be opened or fully written.
sub write_report {
    my ($path, $tf, $idf, $doc_freq) = @_;

    # Score every word once instead of inside the sort comparator.
    my %score = map { $_ => $tf->{$_} * $idf->{$_} } keys %{$tf};

    # The words are decoded characters, so the handle needs an explicit
    # encoding layer to produce well-formed UTF-8.
    open(my $out, '>:encoding(UTF-8)', $path)
        or die "Cannot write $path: $!\n";

    # Highest score first; ties are broken alphabetically so that the report
    # does not depend on hash ordering.
    for my $word (sort { $score{$b} <=> $score{$a} or $a cmp $b } keys %score) {
        printf {$out} "%4d * %2.1f (%2d) = %4.0f\t%s\n",
            $tf->{$word}, $idf->{$word}, $doc_freq->{$word}, $score{$word}, $word;
    }

    close($out) or die "Cannot finish writing $path: $!\n";
    return;
}

# Entry point: counts words per file, derives the document frequencies and
# writes the reports.
#
# Parameters (both optional):
#   $in_dir  - directory holding the input files (default /tmp/data).
#   $out_dir - existing directory for the reports (default /tmp/data_out).
# Dies when the input directory cannot be read or a report cannot be written.
sub main {
    my ($in_dir, $out_dir) = @_;
    $in_dir  = '/tmp/data'     unless defined $in_dir;
    $out_dir = '/tmp/data_out' unless defined $out_dir;

    opendir(my $dh, $in_dir) or die "Cannot open directory $in_dir: $!\n";
    # readdir returns entries in no particular order, so "." and ".." cannot
    # be skipped by position; keeping only regular files drops them together
    # with any subdirectory.
    my @names = sort grep { -f "$in_dir/$_" } readdir($dh);
    closedir($dh);

    # Term frequencies: file name => { word => count }. Every readable file
    # gets an entry, even when it contains no words, so the corpus size used
    # for the IDF is always the number of documents.
    my %tf_of;
    for my $name (@names) {
        my $text = read_document("$in_dir/$name");
        next unless defined $text;

        my %count;
        $count{$_}++ for tokenize($text);
        $tf_of{$name} = \%count;
    }
    my @docs  = grep { exists $tf_of{$_} } @names;
    my $total = scalar @docs;

    # The document frequency of a word is the same for every document, so it
    # is computed once for the whole corpus.
    my %doc_freq;
    for my $name (@docs) {
        $doc_freq{$_}++ for keys %{ $tf_of{$name} };
    }
    my %idf = map { $_ => log($total / $doc_freq{$_}) } keys %doc_freq;

    for my $name (@docs) {
        write_report("$out_dir/$name", $tf_of{$name}, \%idf, \%doc_freq);
    }
    return;
}

# Run only when executed as a script, so the subs can be loaded on their own.
main(@ARGV) unless caller;
Add Comment
Please, Sign In to add comment