Advertisement
Guest User

momoy.php

a guest
Jan 28th, 2020
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 6.74 KB | None | 0 0
  1. <?php
  2.   // Include Composer autoloader if not already done.
  3.   include 'vendor/autoload.php';
  4.   require('preprocess.php');
  5.   require('stemming.php');
  6.   // Parse pdf file and build necessary objects.
  7.  
  8.   $paths = glob('dokumen/*'); // membaca semua file dalam folder dokumen
  9.   $docs = array();
  10.  
  11.   array_pop($paths);
  12.  
  13.   // print_r($paths);
  14.  
  15.   foreach($paths as $path){ // iterasi dokumen
  16.     if(is_file($path)){
  17.       $ext = pathinfo($path, PATHINFO_EXTENSION);
  18.  
  19.       if($ext=='pdf'){
  20.         $parser = new \Smalot\PdfParser\Parser();
  21.  
  22.         $pdf = $parser->parseFile($path);
  23.  
  24.         $preview = $pdf->getText();
  25.  
  26.         $preprocess = new preprocess($preview);
  27.         $keywords = $preprocess->preprocess(trim($preview));
  28.  
  29.         foreach ($keywords as $k => $value) {
  30.           $keywords[$k] = proses(trim($value));
  31.         }
  32.         $keywords = array_filter($keywords);
  33.         $keywords = array_values($keywords);
  34.  
  35.         $docs[] = $keywords;
  36.       }else if($ext=='docx'){
  37.         $x = read_docx($path);
  38.  
  39.         $preprocess = new preprocess($x);
  40.         $keywords = $preprocess->preprocess(trim($x));
  41.  
  42.         foreach ($keywords as $k => $value) {
  43.           $keywords[$k] = proses(trim($value));
  44.         }
  45.         $keywords = array_filter($keywords);
  46.         $keywords = array_values($keywords);
  47.  
  48.         $docs[] = $keywords;
  49.       }else if($ext=='txt'){
  50.         $x = file_get_contents($path);
  51.  
  52.         $preprocess = new preprocess($x);
  53.         $keywords = $preprocess->preprocess(trim($x));
  54.  
  55.         foreach ($keywords as $k => $value) {
  56.           $keywords[$k] = proses(trim($value));
  57.         }
  58.         $keywords = array_filter($keywords);
  59.         $keywords = array_values($keywords);
  60.  
  61.         $docs[] = $keywords;
  62.       }
  63.     }
  64.   }
  65.  
  66.   echo "<br>";
  67.  
  68.   // print_r($paths);
  69.   // Penetuan Query
  70.   $queryWord = array(); //kata query
  71.   foreach ($docs as $key => $doc) {
  72.     foreach ($doc as $k => $q) {
  73.       if(!in_array($q, $queryWord)){
  74.         $queryWord[] = $q;
  75.       }
  76.     }
  77.   }
  78.   // print_r($queryWord);
  79.   // Penetuan Query
  80.  
  81.   $tempCountVal = array();
  82.   foreach ($docs as $key => $doc) {
  83.     $countVal = array_count_values($doc);
  84.     // print_r($doc);
  85.     // echo "<br/>";
  86.     $tempCountVal[] = $countVal;
  87.     // print_r($countVal);
  88.     // echo "<br/>";
  89.     // echo "<br/>";
  90.   }
  91.   // print_r($tempCountVal);
  92.  
  93.   $bobotTerm = array();
  94.   foreach ($queryWord as $key => $value) {
  95.     // echo "<br>".$key.". ".$value."<br>";
  96.  
  97.     $q = array();
  98.     foreach ($tempCountVal as $keyVal => $val) {
  99.       $tempVal;
  100.       foreach ($val as $k => $v) {
  101.         if($value==$k){
  102.           $tempVal = $v;
  103.           break;
  104.         }
  105.         else {
  106.           $tempVal = 0;
  107.         }
  108.       }
  109.  
  110.       $q[$keyVal] = $tempVal;
  111.     }
  112.  
  113.     $qq = array();
  114.  
  115.     $qq['kata'] = $value;
  116.     $qTemp = array();
  117.     foreach ($q as $q1 => $q2) {
  118.       $nFile = str_replace('dokumen/', '', $paths[$q1]);
  119.       $nFile = str_replace('.pdf', '', $nFile);
  120.       $qTemp[$q1] = $q2;
  121.     }
  122.     $qq['bobot'] = $qTemp;
  123.  
  124.     $bobotTerm[] = $qq;
  125.   }
  126.   // print_r($bobotTerm);
  127. ?>
  128.  
  129. <!-- tabel bobot term -->
  130. <table class="table table-responsive table-striped table-hover">
  131.   <thead>
  132.     <tr>
  133.       <th align="center" style="width: 5%">No. </th>
  134.       <th>Term. </th>
  135.       <?php
  136.         $listDoc = array();
  137.         foreach ($paths as $key => $path) {
  138.           $nFile = str_replace('dokumen/', '', $path);
  139.           $nFile = str_replace('.pdf', '', $nFile);
  140.           $listDoc[$key] = $nFile;
  141.               echo ($key+1).". ".$nFile."<br>";
  142.           echo "<th align='center' style='width:15%'>".$nFile."</th>";
  143.         }
  144.  
  145.       ?>
  146.     </tr>
  147.   </thead>
  148.   <tbody>
  149.     <?php
  150.       foreach ($bobotTerm as $key => $value) {
  151.         echo "<tr>";
  152.           echo "<td>".($key+1)."</td>";
  153.           echo "<td>".($value['kata'])."</td>";
  154.           foreach ($value['bobot'] as $v => $val) {
  155.             $nFile = str_replace('dokumen/', '', $paths[$v]);
  156.             $nFile = str_replace('.pdf', '', $nFile);
  157.             echo "<td>".($val)."</td>";
  158.           }
  159.         echo "</tr>";
  160.       }
  161.     ?>
  162.   </tbody>
  163. </table>
  164. <!-- tabel bobot term -->
  165.  
  166. <?php
  167.   // perhitungan persilangan dokumen
  168.   $cosine = array();
  169.   foreach ($bobotTerm as $key => $value) {
  170.     for($i=0; $i<sizeof($paths); $i++){
  171.       $cosTemp = array();
  172.       for($j=0; $j<sizeof($paths); $j++){
  173.         if($i<$j){
  174.           $crossTemp = $value['bobot'][$i]*$value['bobot'][$j];
  175.           $cosine[$i." & ".$j][] = $crossTemp;
  176.         }else{
  177.           #echo "nope";
  178.       }
  179.       }
  180.     }
  181.   }
  182.   // print_r($cosine);
  183.   // perhitungan persilangan dokumen
  184.  
  185.  
  186.   // hitung cosine similarity
  187.   $totalArr = array();
  188.   foreach ($cosine as $key => $value) {
  189.     $tot = 0;
  190.     $calculate = 0;
  191.  
  192.     // echo "(".$key.") ";
  193.     foreach ($value as $k => $val) {
  194.     //   if(sizeof($value)-1>$k){
  195.     //     echo $val."+";
  196.     //   }else{
  197.     //     echo $val;
  198.     //   }
  199.       $tot += $val;
  200.     }
  201.     // echo " = ".$tot;
  202.     // echo "<br/>";
  203.     $data = explode(" & ", $key);
  204.     $tArr = array();
  205.     foreach ($data as $d => $dv) {
  206.       $totalSqrt = 0;
  207.       foreach ($bobotTerm as $bkey => $value) {
  208.         // if(sizeof($bobotTerm)-1>$bkey)
  209.         //   echo ($value['bobot'][$dv]*$value['bobot'][$dv])."+";
  210.         // else{
  211.         //   echo ($value['bobot'][$dv]*$value['bobot'][$dv]);
  212.         // }
  213.         $totalSqrt+=($value['bobot'][$dv]*$value['bobot'][$dv]);
  214.       }
  215.       // echo " = ".$totalSqrt;
  216.       // echo "<br/>";
  217.       $tArr[] = $totalSqrt;
  218.     }
  219.  
  220.     $sqrtTemp = 1;
  221.     // echo "total = ".$tot."/(";
  222.     foreach ($tArr as $t1 => $tval) {
  223.       $sqrtTemp = sqrt($tval)*$sqrtTemp;
  224.       // if(sizeof($tArr)-1>$t1){
  225.       //   echo "v".$tval." x ";
  226.       // }else {
  227.       //   echo "v".$tval;
  228.       // }
  229.     }
  230.     // echo ") = ";
  231.     // echo round($tot/$sqrtTemp, 3);
  232.     $totalArr[$key] = round($tot/$sqrtTemp, 3);
  233.     // print_r($totalArr);
  234.     // echo "<br/><br/>";
  235.   }
  236.   // hitung cosine similarity
  237.   // print_r($totalArr);
  238.   // hitung persentasi
  239.   $seratus = array_sum($totalArr);
  240.   //foreach ($totalArr as $key => $value) {
  241.     //echo $key." -> ".round((($value/$seratus)*100), 3)."<br/>";
  242.   //}
  243.   // hitung persentasi
  244.  
  /**
   * Extract the plain text of a .docx file.
   *
   * Reads "word/document.xml" from the archive, turns table-cell
   * boundaries into spaces and paragraph ends into "\r\n", then strips
   * all remaining tags.
   *
   * @param string $filename Path of the .docx file.
   * @return string|false The extracted text ('' when the archive has no
   *                      "word/document.xml"), or false when the path is
   *                      empty, missing, or not a readable zip archive.
   */
  function read_docx($filename){
    if (!$filename || !file_exists($filename)) return false;

    // ZipArchive replaces the procedural zip_* API (deprecated since
    // PHP 8.0) and fetches the entry directly, so no entry handle is
    // left open.
    $zip = new ZipArchive();
    if ($zip->open($filename) !== true) return false;

    $content = $zip->getFromName('word/document.xml');
    $zip->close();

    if ($content === false) {
      // No main document part in the archive: nothing to extract.
      $content = '';
    }

    // Cell boundary first, so the paragraph rule below does not consume it.
    $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
    $content = str_replace('</w:r></w:p>', "\r\n", $content);

    return strip_tags($content);
  }
  272.  
  273. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement