Guest User

Untitled

a guest
Sep 5th, 2016
206
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.58 KB | None | 0 0
  #!/usr/bin/perl
  #
  # Computes TF-IDF word scores for every regular file in an input directory
  # and writes one report per input file (same file name) into an output
  # directory.
  #
  # Usage: script [INPUT_DIR [OUTPUT_DIR]]
  #   INPUT_DIR   defaults to /tmp/data
  #   OUTPUT_DIR  defaults to /tmp/data_out (must already exist)
  #
  # Each report line has the form
  #   <count> * <idf> (<documents containing word>) = <count*idf> TAB <word>
  # and lines are ordered by descending count*idf, ties broken alphabetically.
  use strict;
  use warnings;
  use utf8;
  use Encode;

  # Returns the sorted names of the regular files directly inside $dir.
  # Filtering with -f drops "." and ".." (and any subdirectory) explicitly,
  # because readdir makes no promise about where those entries appear.
  sub list_files {
      my ($dir) = @_;

      opendir(my $dh, $dir) or die "Cannot open directory $dir: $!";
      my @names = sort grep { -f "$dir/$_" } readdir($dh);
      closedir($dh);

      return @names;
  }

  # Reads the file at $path as raw bytes and returns it decoded from UTF-8.
  # An empty file yields the empty string.
  sub read_text {
      my ($path) = @_;

      open(my $fh, '<:raw', $path) or die "Cannot read $path: $!";
      my $bytes = do { local $/; <$fh> };
      close($fh);

      return '' unless defined $bytes;
      return decode_utf8($bytes);
  }

  # Lower-cases $text, strips markup / punctuation / numeric noise and
  # returns the remaining words as a list (duplicates preserved).
  sub extract_words {
      my ($text) = @_;

      # The token filters and the final extraction are anchored on spaces,
      # so pad both ends: otherwise the first and last tokens are never seen.
      my $t = ' ' . lc($text) . ' ';

      $t =~ s/<a [^>]+>[^<]+<[^>]+>/ /g;            # whole links, text included
      $t =~ s/<br>/. /g;                            # line break ends a sentence
      $t =~ s/<[^>]+>/ /g;                          # any remaining tag
      $t =~ s/>>\d+/ /g;                            # ">>12345" style references
      $t =~ s/(&#[a-f0-9]+;|&[^;]{1,6};)/ /g;       # character entities
      $t =~ s/[,.!?_&#'"()\[\]=:;]/ /g;             # selected punctuation

      # Normalise whitespace BEFORE the space-anchored filters below, so a
      # token next to a newline or tab is treated like any other token.
      $t =~ s/\s+/ /g;

      # The trailing space is only looked at, not consumed, so neighbouring
      # tokens that share a single separating space are all removed in one
      # pass.
      $t =~ s/ \d+d\d+(?= )/ /g;                            # dice notation, e.g. 2d6
      $t =~ s/ +d(?= )/ /g;                                 # stray lone "d"
      $t =~ s/ [[:punct:]]*[0-9]+[[:punct:]]*(?= )/ /g;     # bare numbers

      return $t =~ m/ ([^ ]+)/g;
  }

  # Writes the report for one document to $path.
  #   $count     - hashref: word => occurrences in this document
  #   $doc_freq  - hashref: word => number of documents containing the word
  #   $doc_total - total number of documents
  sub write_scores {
      my ($path, $count, $doc_freq, $doc_total) = @_;

      my %idf   = map { $_ => log($doc_total / $doc_freq->{$_}) } keys %$count;
      my %score = map { $_ => $count->{$_} * $idf{$_} } keys %$count;

      # Words are decoded character strings, so encode explicitly on output.
      open(my $out, '>:encoding(UTF-8)', $path) or die "Cannot write $path: $!";
      for my $word (sort { $score{$b} <=> $score{$a} or $a cmp $b } keys %score) {
          printf {$out} "%4d * %2.1f (%2d) = %4.0f\t%s\n",
              $count->{$word}, $idf{$word}, $doc_freq->{$word},
              $score{$word}, $word;
      }
      close($out) or die "Cannot finish writing $path: $!";

      return;
  }

  # Entry point: scores every regular file in $input_dir and writes the
  # reports into $output_dir. Both arguments are optional.
  sub main {
      my ($input_dir, $output_dir) = @_;
      $input_dir  = '/tmp/data'     unless defined $input_dir;
      $output_dir = '/tmp/data_out' unless defined $output_dir;

      my @names = list_files($input_dir);

      # Term counts per document.
      my %count_for;
      for my $name (@names) {
          my %count;
          $count{$_}++ for extract_words(read_text("$input_dir/$name"));
          $count_for{$name} = \%count;
      }

      # Document frequency is a property of the word alone, so compute it
      # once for the whole collection instead of once per document.
      my %doc_freq;
      for my $count (values %count_for) {
          $doc_freq{$_}++ for keys %$count;
      }

      my $doc_total = scalar @names;
      for my $name (@names) {
          write_scores("$output_dir/$name", $count_for{$name},
                       \%doc_freq, $doc_total);
      }

      return;
  }

  # Run only when executed as a script, so the file can also be loaded
  # (e.g. by a test) without touching the default directories.
  main(@ARGV) unless caller;
Add Comment
Please, Sign In to add comment