Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
<?php
/**
 * Document loading stage: reads every supported document from the "dokumen"
 * folder and turns each one into a list of stemmed keywords.
 *
 * Variables left in scope for the rest of the script:
 *   $paths - paths of the documents that were actually loaded
 *   $docs  - keyword lists; $docs[$i] always belongs to $paths[$i]
 */

// Composer autoloader (provides \Smalot\PdfParser\Parser), included only if
// not already done, plus the project's tokenizer and stemmer. The *_once
// variants keep a second inclusion of this script from redeclaring symbols.
include_once 'vendor/autoload.php';
require_once 'preprocess.php';
require_once 'stemming.php';

// Read every entry of the document folder.
$candidates = glob('dokumen/*');
if ($candidates === false) {
    // glob() signals an error with false; treat that as an empty folder.
    $candidates = array();
}

// NOTE(review): the last entry returned by glob() is deliberately discarded
// here, exactly as before. The reason is not visible in this file - confirm
// it is still wanted before removing this call.
array_pop($candidates);

$paths = array();
$docs = array();

foreach ($candidates as $path) {
    if (!is_file($path)) {
        continue;
    }

    // Extract the raw text according to the (case-insensitive) file type.
    $ext = strtolower(pathinfo($path, PATHINFO_EXTENSION));
    switch ($ext) {
        case 'pdf':
            $parser = new \Smalot\PdfParser\Parser();
            $text = $parser->parseFile($path)->getText();
            break;
        case 'docx':
            $text = read_docx($path);
            break;
        case 'txt':
            $text = file_get_contents($path);
            break;
        default:
            // Unsupported type: leave it out of the comparison entirely.
            continue 2;
    }

    if ($text === false) {
        // Unreadable document: skip it instead of tokenizing "false".
        continue;
    }

    // Tokenize, stem every token, then drop empty results and reindex.
    $preprocess = new preprocess($text);
    $keywords = $preprocess->preprocess(trim($text));
    foreach ($keywords as $k => $word) {
        $keywords[$k] = proses(trim($word));
    }

    // Record the path together with its keywords so that both lists share
    // the same indices; later stages look documents up by that index.
    $docs[] = array_values(array_filter($keywords));
    $paths[] = $path;
}
echo "<br>";

// Vocabulary: every distinct keyword found in any document, in order of
// first appearance. $seenWords is a key lookup so that membership is an
// exact match and does not require rescanning the whole list per word.
$queryWord = array();
$seenWords = array();
foreach ($docs as $doc) {
    foreach ($doc as $word) {
        if (!isset($seenWords[$word])) {
            $seenWords[$word] = true;
            $queryWord[] = $word;
        }
    }
}

// Term frequencies per document: $tempCountVal[$docIndex][$word] = count.
$tempCountVal = array();
foreach ($docs as $doc) {
    $tempCountVal[] = array_count_values($doc);
}

// Term weight matrix. Each entry is
//   array('kata' => term, 'bobot' => array(docIndex => occurrences))
// where a document that does not contain the term gets 0.
$bobotTerm = array();
foreach ($queryWord as $word) {
    $weights = array();
    foreach ($tempCountVal as $docIndex => $counts) {
        // Direct lookup: also yields 0 for a document with no keywords,
        // instead of reusing the count left over from the previous document.
        $weights[$docIndex] = isset($counts[$word]) ? $counts[$word] : 0;
    }
    $bobotTerm[] = array(
        'kata' => $word,
        'bobot' => $weights,
    );
}
?>
<?php
// Display label per document: the path without the folder prefix and
// without ".pdf". Labels are collected first so the numbered list can be
// printed ahead of the table; loose text is not valid between <th> cells.
$listDoc = array();
foreach ($paths as $key => $path) {
    $nFile = str_replace('dokumen/', '', $path);
    $nFile = str_replace('.pdf', '', $nFile);
    $listDoc[$key] = $nFile;
    // File names are escaped before being written into the page.
    echo ($key + 1) . ". " . htmlspecialchars($nFile, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<br>";
}
?>
<!-- tabel bobot term -->
<table class="table table-responsive table-striped table-hover">
<thead>
<tr>
<th align="center" style="width: 5%">No. </th>
<th>Term. </th>
<?php
// One header column per document.
foreach ($listDoc as $label) {
    echo "<th align='center' style='width:15%'>" . htmlspecialchars($label, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "</th>";
}
?>
</tr>
</thead>
<tbody>
<?php
// One row per term: running number, the term, then its count per document.
foreach ($bobotTerm as $key => $value) {
    echo "<tr>";
    echo "<td>" . ($key + 1) . "</td>";
    // Terms come from document content, so escape them for HTML output.
    echo "<td>" . htmlspecialchars((string) $value['kata'], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "</td>";
    foreach ($value['bobot'] as $val) {
        echo "<td>" . ($val) . "</td>";
    }
    echo "</tr>";
}
?>
</tbody>
</table>
<!-- tabel bobot term -->
<?php
// Cosine similarity between every pair of documents, based on the term
// weight matrix in $bobotTerm.
//
// Variables left in scope:
//   $cosine   - "i & j" => list of per-term products weight_i * weight_j
//   $totalArr - "i & j" => cosine similarity rounded to 3 decimals
//   $seratus  - sum of all similarities

// The weight vectors are keyed by document index (position in $docs), so
// the document count is taken from $docs rather than from the folder listing.
$docCount = count($docs);

// Term-wise products for each unordered document pair (i < j).
$cosine = array();
foreach ($bobotTerm as $term) {
    $weights = $term['bobot'];
    for ($i = 0; $i < $docCount; $i++) {
        for ($j = $i + 1; $j < $docCount; $j++) {
            $cosine[$i . " & " . $j][] = $weights[$i] * $weights[$j];
        }
    }
}

// Squared vector length of each document, computed once instead of once
// per pair.
$squaredNorm = array();
foreach ($bobotTerm as $term) {
    foreach ($term['bobot'] as $docIndex => $weight) {
        if (!isset($squaredNorm[$docIndex])) {
            $squaredNorm[$docIndex] = 0;
        }
        $squaredNorm[$docIndex] += $weight * $weight;
    }
}

// similarity(i, j) = dot(i, j) / (|i| * |j|)
$totalArr = array();
foreach ($cosine as $pair => $products) {
    list($first, $second) = explode(" & ", $pair);
    $denominator = sqrt($squaredNorm[$first]) * sqrt($squaredNorm[$second]);
    if ($denominator > 0) {
        $totalArr[$pair] = round(array_sum($products) / $denominator, 3);
    } else {
        // A document without keywords has zero length; report no
        // similarity instead of dividing by zero.
        $totalArr[$pair] = 0.0;
    }
}

// Base value for percentage figures: a pair's share would be
// round(($similarity / $seratus) * 100, 3). Nothing in this part of the
// script prints it.
$seratus = array_sum($totalArr);
/**
 * Extracts the plain text of a .docx file.
 *
 * Reads the "word/document.xml" part of the archive, turns table-cell
 * boundaries into spaces and paragraph ends into CRLF line breaks, then
 * strips all remaining markup.
 *
 * @param string $filename Path to the .docx file.
 * @return string|false The extracted text ('' when the archive has no
 *                      "word/document.xml" part), or false when the path
 *                      is empty, missing, or not a readable zip archive.
 */
function read_docx($filename)
{
    if (!$filename || !file_exists($filename)) {
        return false;
    }

    // ZipArchive replaces the procedural zip_* functions and fetches the
    // one needed entry by name, so no other entry is opened or left open.
    $zip = new ZipArchive();
    if ($zip->open($filename) !== true) {
        return false;
    }
    $content = $zip->getFromName('word/document.xml');
    $zip->close();

    if ($content === false) {
        // Valid archive without a main document part: nothing to extract.
        return '';
    }

    // A cell boundary inside a table row becomes a space; every other
    // paragraph end becomes a line break. Order matters: the cell pattern
    // contains the paragraph pattern.
    $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
    $content = str_replace('</w:r></w:p>', "\r\n", $content);

    return strip_tags($content);
}
- ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement