Advertisement
janci

Rouming kiwi scraper

Sep 24th, 2015
51
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 1.48 KB | None | 0 0
  1. <?php
  2.  
  // Renders the link structure stored in out.json as a Graphviz "dot" digraph
  // on standard output. out.json maps a page name to the list of page names it
  // links to. Pages that are linked to but have no entry of their own in
  // out.json are drawn as red filled nodes.
  //
  // out.json is produced by the one-off scrape below, kept for reference:
  /*
  $pages = Array();
  $data = file_get_contents("http://www.rouming.cz/roumingKiwiList.php");
  $links = get_links($data);
  foreach($links as $page){
      $html = file_get_contents("http://www.rouming.cz/roumingKiwi.php?page=".$page);
      $pages[$page] = get_links($html);
  }
  file_put_contents("out.json",json_encode($pages));
  */

  // Fail loudly instead of letting a missing or corrupt file reach array_keys().
  $json = file_get_contents("out.json");
  if ($json === false) {
      throw new RuntimeException('Unable to read out.json');
  }
  $pages = json_decode($json, true);
  if (!is_array($pages)) {
      throw new RuntimeException('out.json does not hold a JSON object: ' . json_last_error_msg());
  }

  // Set of slugs that have their own entry in out.json (slug => position).
  $existing = array_flip(array_map("slug", array_keys($pages)));

  // Every slug that appears either as a source page or as a link target.
  $pagelist = [];
  foreach ($pages as $key => $list) {
      $pagelist[] = slug($key);
      foreach ($list as $page) {
          $pagelist[] = slug($page);
      }
  }
  // array_unique() keeps the first key of each value, so node ids may have gaps;
  // that is harmless because ids are only used as opaque node names.
  $pagelist = array_unique($pagelist);

  // slug => node id. Replaces a loose array_search() per edge: lookups are now
  // exact (numeric-looking slugs such as "1e1" and "10" no longer collide) and O(1).
  $idOf = array_flip($pagelist);

  $out = [];
  foreach ($pages as $page => $to) {
      $from = $idOf[slug($page)];
      foreach ($to as $t) {
          $target = slug($t);
          if (!isset($idOf[$target])) {
              // Defensive only: every target slug was added to $pagelist above.
              // Emit the note but skip the edge, which would otherwise be
              // written with an empty destination id.
              $out[] = "# missing $t \n";
              continue;
          }
          $out[] = "  x" . $from . " -> x" . $idOf[$target] . "[color=\"#000000aa\"];\n";
      }
  }
  // Collapse repeated links between the same pair of pages into a single edge.
  $out = array_unique($out);

  echo "digraph G {\n";
  echo "overlap = false\n";
  echo "truecolor = true\n";
  foreach ($pagelist as $i => $key) {
      // Escape quote and backslash so an unusual page name cannot break the
      // quoted DOT label.
      echo "x$i [label=\"" . addcslashes($key, "\"\\") . "\"";
      if (!isset($existing[$key])) {
          echo ", fillcolor=red, fontcolor=white, style=filled";
      }
      echo "]\n";
  }

  echo implode("", $out);
  echo "}\n";
  57.  
  58.  
  /**
   * Collects the value of every `?page=` query parameter that is terminated by
   * a double quote on the same line of the given markup.
   *
   * @param string $data Markup to scan.
   * @return array Captured page values in document order, duplicates included;
   *               an empty array when nothing matches or the regex engine fails.
   */
  function get_links($data){
      $match = [];
      // [^"\n]+ rather than a lazy .+?: the value may not contain a quote, so an
      // empty `?page=` can no longer swallow the closing quote and run on into
      // the next attribute.
      if (!preg_match_all('/\?page=([^"\n]+)"/', (string) $data, $match)) {
          return [];
      }
      return $match[1];
  }
  64.  
  /**
   * Normalises a page name to lower case so that names differing only in
   * letter case compare equal.
   *
   * Uses multibyte-aware lowercasing when the mbstring extension is loaded and
   * the input is valid UTF-8, so non-ASCII letters are folded too; otherwise
   * falls back to byte-wise strtolower().
   *
   * @param mixed $x Page name; cast to string before normalising.
   * @return string Lower-cased name.
   */
  function slug($x){
      $x = (string) $x;
      if (function_exists('mb_strtolower') && mb_check_encoding($x, 'UTF-8')) {
          return mb_strtolower($x, 'UTF-8');
      }
      return strtolower($x);
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement