Advertisement
Guest User

Untitled

a guest
Feb 23rd, 2017
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. <?php
  // Report every class of error and print it into the page output.
  error_reporting(-1);
  ini_set('display_errors', true);

  // Make HEAD the default HTTP method of the default stream context, which
  // get_headers() uses when no explicit context is passed.
  stream_context_set_default(['http' => ['method' => 'HEAD']]);

  // Map of md5(link) => absolute link, filled in by the crawl below.
  $links = [];
  // Initialised here but not read anywhere else in this script.
  $checked = [];

  loadRecursive(getMainUrl(), $links);

  // Dump everything the crawl collected.
  var_dump($links);
  /**
   * Returns the base URL of the site being crawled, without a trailing slash.
   *
   * @return string
   */
  function getMainUrl()
  {
      $baseUrl = 'http://theory.phphtml.net';

      return $baseUrl;
  }
  21.  
  /**
   * Collects the site-internal links found on a single page into $found.
   *
   * Progress is printed as HTML ("Parsing ..." before the fetch, "Found ..."
   * after it). Links that resolve to the page itself are skipped.
   *
   * @param string $url   Absolute URL of the page to parse.
   * @param array  $found Map of md5(link) => absolute link; updated in place.
   *
   * @return void
   */
  function loadRecursive1($url, array &$found)
  {
      // URLs originate from scraped markup, so escape them before echoing
      // them into the HTML progress output.
      $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
      echo "Parsing " . $safeUrl . " <br/>";

      $baseUrl = rtrim(getMainUrl(), '/');
      $counter = 0;
      foreach (findLinks(loadHTML($url)) as $href) {
          // Join with exactly one slash so hrefs without a leading "/"
          // (e.g. "page.html") do not get glued onto the host name.
          // Relative paths are resolved against the site root, which is an
          // approximation for pages that live in sub-directories.
          $link = $baseUrl . '/' . ltrim($href, '/');
          if ($link === $url) {
              continue;
          }

          // Keyed by hash so the same link is stored only once.
          $found[md5($link)] = $link;
          $counter++;
      }

      echo "Found " . $counter . " links on " . $safeUrl . "<hr/>";
  }
  45.  
  46.  
  /**
   * Crawls two levels deep: collects the links on $url, then parses every
   * link known at that point with loadRecursive1().
   *
   * Despite the name there is no unbounded recursion; links discovered during
   * the second pass are recorded in $found but are not visited themselves.
   * Progress is printed as HTML.
   *
   * @param string $url   Absolute URL of the start page.
   * @param array  $found Map of md5(link) => absolute link; updated in place.
   *
   * @return void
   */
  function loadRecursive($url, array &$found)
  {
      // URLs originate from scraped markup, so escape them before echoing
      // them into the HTML progress output.
      $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
      echo "Parsing " . $safeUrl . " <br/>";

      $baseUrl = rtrim(getMainUrl(), '/');
      $counter = 0;
      foreach (findLinks(loadHTML($url)) as $href) {
          // Join with exactly one slash so hrefs without a leading "/"
          // (e.g. "page.html") do not get glued onto the host name.
          // Relative paths are resolved against the site root, which is an
          // approximation for pages that live in sub-directories.
          $link = $baseUrl . '/' . ltrim($href, '/');
          if ($link === $url) {
              continue;
          }

          // Keyed by hash so the same link is stored only once.
          $found[md5($link)] = $link;
          $counter++;
      }

      // Iterate over an explicit snapshot: loadRecursive1() appends to $found
      // while we loop, and only the links known before this pass should be
      // visited, regardless of how the PHP version handles foreach on a
      // referenced array.
      $linksToVisit = $found;
      foreach ($linksToVisit as $link) {
          loadRecursive1($link, $found);
      }

      echo "Found " . $counter . " links on " . $safeUrl . "<hr/>";
  }
  73.  
  /**
   * Checks whether a URL answers with a successful (2xx/3xx) status and an
   * HTML content type.
   *
   * The request is made with get_headers(), which uses the default stream
   * context when none is passed explicitly.
   *
   * @param string $url URL to probe.
   *
   * @return bool True when the final response is 200-399 and its media type
   *              is text/html; false otherwise, including on any failure.
   */
  function urlIsOk($url)
  {
      // get_headers() reports connection problems as warnings; failure is
      // already signalled through the false return value handled below.
      $headers = @get_headers($url, 1);
      if (!is_array($headers) || !$headers) {
          return false;
      }

      // In associative mode the status lines are stored under integer keys,
      // one per response. When redirects were followed the last one is the
      // status of the final response.
      $statusCode = null;
      foreach ($headers as $key => $value) {
          if (is_int($key) && is_string($value) && preg_match('~^HTTP/\S+\s+(\d{3})~', $value, $matches)) {
              $statusCode = (int)$matches[1];
          }
      }
      if ($statusCode === null || $statusCode < 200 || $statusCode >= 400) {
          return false;
      }

      // Header names keep whatever case the server sent, so look them up
      // case-insensitively.
      $normalized = array_change_key_case($headers, CASE_LOWER);
      if (!isset($normalized['content-type'])) {
          return false;
      }

      // Repeated headers (e.g. across redirects) arrive as an array; the last
      // entry belongs to the final response.
      $contentType = $normalized['content-type'];
      if (is_array($contentType)) {
          $contentType = end($contentType);
      }

      // Compare only the media type so "text/html; charset=utf-8" is accepted.
      $parts = explode(';', (string)$contentType);
      $mediaType = strtolower(trim($parts[0]));

      return $mediaType === 'text/html';
  }
  94.  
  /**
   * Downloads the body of a page, following redirects.
   *
   * The URL is first probed with urlIsOk(); anything that is not a reachable
   * HTML page yields an empty string, as does any transfer error.
   *
   * @param string $url Absolute URL to download.
   *
   * @return string The response body, or '' when the page could not be loaded.
   */
  function loadHTML($url)
  {
      if (!urlIsOk($url)) {
          return '';
      }

      $curl = curl_init();
      if ($curl === false) {
          return '';
      }

      curl_setopt_array($curl, [
          CURLOPT_URL            => $url,
          CURLOPT_FOLLOWLOCATION => true,
          CURLOPT_RETURNTRANSFER => true,
          // NOTE(review): certificate and host verification are switched off,
          // as in the original; enable both before crawling over HTTPS on an
          // untrusted network.
          CURLOPT_SSL_VERIFYHOST => 0,
          CURLOPT_SSL_VERIFYPEER => 0,
          // Bound the request so one stalled server cannot hang the crawl.
          CURLOPT_CONNECTTIMEOUT => 10,
          CURLOPT_TIMEOUT        => 30,
      ]);

      $result = curl_exec($curl);
      curl_close($curl);

      // curl_exec() returns false on a transfer error.
      if ($result === false) {
          return '';
      }

      return (string)$result;
  }
  115.  
  /**
   * Extracts the unique site-relative hrefs of all <a> elements in a document.
   *
   * Skipped: empty hrefs, hrefs that carry a URL scheme ("http:", "https:",
   * "mailto:", "javascript:", ...) or are protocol-relative ("//host/..."),
   * and hrefs containing a "#" fragment.
   *
   * @param string $html HTML markup; may be empty.
   *
   * @return string[] Unique relative hrefs in document order.
   */
  function findLinks($html)
  {
      // loadHTML() rejects an empty string (ValueError on PHP 8), and callers
      // pass '' for every page that could not be downloaded.
      if (!is_string($html) || trim($html) === '') {
          return [];
      }

      $domDocument = new DOMDocument();

      // Real-world markup is rarely valid; collect libxml's parse warnings
      // internally instead of emitting them, then restore the old setting.
      $previousSetting = libxml_use_internal_errors(true);
      $domDocument->loadHTML($html);
      libxml_clear_errors();
      libxml_use_internal_errors($previousSetting);

      $xpath = new DOMXPath($domDocument);

      /**
       * @var DOMNodeList $elements
       */
      $elements = $xpath->query('//a[@href]'); // find all elements that have an href attribute
      $links = [];

      foreach ($elements as $element) {
          $href = $element->getAttribute('href');
          if ($href === '') {
              continue;
          }

          // Anchor the test at the start of the href: a relative path that
          // merely contains "http" (e.g. "/php-http-client") is still relative.
          $isAbsolute = preg_match('~^(?:[a-z][a-z0-9+.\-]*:|//)~i', $href) === 1;
          $isAnchor = strpos($href, '#') !== false;
          if ($isAbsolute || $isAnchor) {
              continue;
          }

          $links[] = $href;
      }

      return array_values(array_unique($links));
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement