Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- use locale;
- use utf8;
- use encoding 'utf8';
- ####################### config variables
- $operation;
- $input;
- $output;
- $minCharCount;
- $minFrequency;
- ####################### procedures and functions
- sub readConfig
- {
- my $source = shift(@_);
- if(open(SOURCE,$source))
- {
- while ($row = <SOURCE>)
- {
- chomp($row);
- push(@arrayRow,$row);
- }
- $operation = uc($arrayRow[0]);
- $input = $arrayRow[1];
- $output = $arrayRow[2];
- $minCharCount = $arrayRow[3];
- $minFrequency = $arrayRow[4];
- }
- else
- {
- print "FATAL ERROR: unable to read configuration\n";
- exit();
- }
- if($operation ne "TP")
- {
- if($operation ne "TF")
- {
- if($operation ne "TF_IDF")
- {
- print "FATAL ERROR: unknown operation!\n";
- print "HINT: use tp, tf or tf_idf only.\n";
- exit();
- }}}
- }
- sub writeConfig()
- {
- print "## Configuration file: $ARGV[0] ##\n";
- print "Operation: $operation\n";
- print "Input file: $input\n";
- print "Output file: $output\n";
- print "Minimal char count: $minCharCount\n";
- print "Minimal Frequency: $minFrequency\n";
- print "###########################################\n";
- }
- sub getWords
- {
- open(DATA, $input);#otevre soubor zadany argumentem
- @radky=<DATA>; #rozdeleno po radcich
- $pocetSouboru=0; #pocitadlo
- for $lala (@radky) #pro kazde slovo na predhozenem radku
- {
- $lala =~s/^.+\t//; #odstraneni tridy
- $lala =~s/\n//; #odstraneni konce radku
- $lala =~s/<(.*)>//; #odstraneni tagu
- push(@slova, split(/\s+/, $lala)); #pridani do vektoru slov
- $pocetSouboru++; #pocet souboru +1
- }
- $pocetSouboru--;#odecteni posledniho prazdneho radku
- $counter=0;
- for $lala (@slova) #prazdnych slov
- {
- $lala=lc($lala);#lower case
- $lala =~s/\s//; #prazdne znaky
- $lala =~s/[^a-zA-Z]//;#jiny znaky nez pismena
- if($lala eq "")
- {
- splice(@slova, $counter, 1);#odstraneni prazdnych veci
- };
- $counter++;
- }
- $counter=0;
- if($minFrequency > 1) #ma smysl jen pokud minimalni vyskyt je vetsi nez 1
- {
- #print "-> minFrequency greater than 1\n";
- for($counter=0;$counter<=$#slova;$counter++) #minimalni vyskyt
- {
- $i=0;
- #print"-> testuji slovo: ( $slova[$i] ) |> $slova[$counter]\n";
- $frequency=0; #nulovy vyskyt
- for($i=0;$i<=$#slova;$i++)
- {
- #print" $slova[$counter] -> $slova[$i] \n";
- if($slova[$i] eq $slova[$counter])
- {
- $frequency++;
- #print"Stejna slova! F: $frequency\n";
- }
- }
- if($frequency < $minFrequency)
- {
- #print "odstranuji slovo: $slova[$counter]\n";
- splice(@slova, $counter, 1);
- }
- #print"####################\n\n";
- }
- }
- $counter=0;
- for $lala (@slova)#odstraneni duplicit
- {
- for($i=0;$i<$counter;$i++)
- {
- if($slova[$i] eq $slova[$counter])
- {
- splice(@slova, $i, 1);
- $i=0;#nutny reset
- }
- }
- $counter++;
- }
- $counter = 0;
- for $lala(@slova)#ostrani slova kratsi nez pozadovane
- {
- if(length($lala) <= $minCharCount)
- {
- #print "nalezeno slovo: $lala\n";
- #print length($lala);
- #print "\n";
- splice(@slova, $counter, 1);
- }
- $counter++;
- }
- return @slova; #vraci vektor cistych slov
- }
- sub writeWords
- {
- print "\n## Vectore of clear word ##\n";
- for $lala (@words)
- {
- $lala=uc($lala);
- $lala =~s/\s//;
- print $lala;
- print " ";
- }
- print "\n## End of vector ##\n\n";
- }
- sub TP
- {
- open(DATA, $input); #otevre soubor zadany argumentem
- @radky=<DATA>; #rozdeleno po radcich
- for $lala (@slova)
- {
- $lala=uc($lala);
- $lala =~s/\s//;
- print $lala;
- print " ";
- }
- print "_CLASS_\n";
- for $radek (@radky)
- {
- $class = $radek;
- $radek =~s/\n//;
- $class =~s/\t.*\n//;
- $radek =~s/^.*\t//;
- $radek=uc($radek);
- for $word (@words)
- {
- $word=uc($word);
- if(index(" ".$radek." "," ".$word." ") != -1)
- {
- print "1";
- for($i=0;$i<(length($word)+2);$i++)
- {
- print " ";
- }
- }
- else
- {
- print "0";
- for($i=0;$i<(length($word)+2);$i++)
- {
- print " ";
- }
- }
- }
- print "$class\n";
- }
- }
- sub TF
- {
- open(DATA, $input); #otevre soubor zadany argumentem
- @radky=<DATA>; #rozdeleno po radcich
- for $lala (@slova)
- {
- $lala=uc($lala);
- $lala =~s/\s//;
- print $lala;
- print " ";
- }
- print "_CLASS_\n";
- for $radek (@radky)
- {
- $class = $radek;
- $radek =~s/\n//;
- $class =~s/\t.*\n//;
- $radek =~s/^.*\t//;
- $radek=uc($radek);
- @wordsOnRow = split(' ',$radek);
- for $word (@words)
- {
- $word=uc($word);
- $sum=0;
- for $wordOnRow (@wordsOnRow)
- {
- if($wordOnRow eq $word)
- {
- $sum++;
- }
- }
- print "$sum";
- for($i=0;$i<(length($word)+2);$i++)
- {
- print " ";
- }
- }
- print "$class\n";
- }
- }
- sub TF_IDF
- {
- open(DATA, $input); #otevre soubor zadany argumentem
- @radky=<DATA>; #rozdeleno po radcich
- $docCount = @radky; # N -> log(N) -> pocet dokumentu
- $docIncCount = 0; # n(t_i) -> pocet dokumentu obsahujici vyraz
- $countInDoc = 0; # n_ij -> pocet vyskytu v dokumentu
- $sumInDoc = 0; # suma n_ij -> pocet termu v danem dokumentu
- ## TF-IDF = TF * IDF = n_ij / suma n_ij * log N / n(t_i)
- for $lala (@slova)
- {
- $lala=uc($lala);
- $lala =~s/\s//;
- print $lala;
- print " ";
- }
- print "_CLASS_\n";
- @pomRows = @radky;
- for $radek (@radky)
- {
- $class = $radek;
- $radek =~s/\n//;
- $class =~s/\t.*\n//;
- $radek =~s/^.*\t//;
- $radek=uc($radek);
- @wordsOnRow = split(' ',$radek);
- for $word (@words)
- {
- $word=uc($word);
- $countInDoc=0; #pocet vyhovujicich
- $sumInDoc=0; #celkovy pocet
- $docInCount=0;
- for $wordOnRow (@wordsOnRow)
- {
- for $row (@pomRows)
- {
- $row =~s/\n//;
- $row =~s/^.*\t//;
- $row=uc($row);
- @wordsOnPomRow = split(' ',$row);
- for $wordOnPomRow (@wordsOnPomRow)
- {
- if($wordOnPomRow eq $word)
- {
- $docInCount++;
- last;
- }
- }
- }
- if($wordOnRow eq $word)
- {
- $countInDoc++;
- }
- $sumInDoc++;
- }
- # TF-IDF = TF * IDF = n_ij / suma n_ij * log N / n(t_i)
- $TF = $countInDoc / $sumInDoc;
- if($countInDoc <= 0)
- {
- $result=0;
- }
- else
- {
- $IDF = log($docCount)/$countInDoc;
- $result = $TF*$IDF;
- }
- if($result == 0) {print "$result ";}
- else { printf("%.2f", $result);}
- for($i=0;$i<(length($word)+2);$i++)
- {
- print " ";
- }
- }
- print "$class\n";
- }
- }
- ####################### main program
- readConfig($ARGV[0]);
- writeConfig();
- @words = getWords();
- writeWords();
- if($operation eq "TP")
- {
- TP();
- }
- elsif($operation eq "TF")
- {
- TF();
- }
- elsif($operation eq "TF_IDF")
- {
- TF_IDF();
- }
- else
- {
- print"FATAL ERROR: unknown operation\n";
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement