Advertisement
Guest User

Untitled

a guest
Feb 23rd, 2017
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 3.20 KB | None | 0 0
  1. <?php
  // Development diagnostics: report every error level and print errors into the output.
  ini_set('display_errors', true);
  error_reporting(-1);

  // Make HEAD the default method of the http stream wrapper, so that calls using the
  // default stream context (get_headers() in urlIsOk()) do not download response bodies.
  stream_context_set_default([
      'http' => [
          'method' => 'HEAD',
      ],
  ]);
  11.  
  // Crawl state, passed by reference through the crawl functions:
  //   $checked - URLs that have been requested,
  //   $links   - discovered URLs, keyed by the md5 of the URL.
  $checked = [];
  $links = [];

  // Start crawling from the site root.
  loadRecursive(getMainUrl(), $links, $checked);
  15.  
  16.  
  /**
   * Returns the root URL of the site being crawled.
   *
   * @return string Scheme and host, without a trailing slash.
   */
  function getMainUrl()
  {
      $rootUrl = 'http://theory.phphtml.net';

      return $rootUrl;
  }
  21.  
  /**
   * Scans a single page and records every relative link found on it.
   *
   * Despite its name this function does not recurse: it fetches $url, extracts
   * the links and adds them to $found, printing a short HTML progress report.
   *
   * @param string   $url     Absolute URL of the page to scan.
   * @param string[] $found   Discovered URLs keyed by the md5 of the URL; updated in place.
   * @param string[] $checked URLs requested so far; $url is appended.
   *
   * @return void
   */
  function loadRecursive1($url, array &$found, array &$checked)
  {
      // URLs originate from remote markup, so escape them before printing into HTML.
      $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');

      echo "Parsing " . $safeUrl . " <br/>";
      $checked[] = $url;

      // Join base and href with exactly one slash. Plain concatenation turned an
      // href such as "page.html" into "http://hostpage.html".
      // NOTE(review): hrefs are resolved against the site root, not against the
      // directory of the current page - confirm this matches the site layout.
      $baseUrl = rtrim(getMainUrl(), '/');
      $counter = 0;
      foreach (findLinks(loadHTML($url)) as $href) {
          $link = $baseUrl . '/' . ltrim($href, '/');
          if ($link === $url) {
              // A page linking to itself is not a new discovery.
              continue;
          }

          $found[md5($link)] = $link;
          $counter++;
      }

      echo "Found " . $counter . " links on " . $safeUrl . "<hr/>";
  }
  46.  
  47.  
  /**
   * Scans the start page, then scans every URL collected in $found.
   *
   * The links of $url are added to $found first; afterwards each entry of
   * $found is handed to loadRecursive1(), which records that page's links too.
   * Progress is printed as HTML.
   *
   * @param string   $url     Absolute URL of the page to start from.
   * @param string[] $found   Discovered URLs keyed by the md5 of the URL; updated in place.
   * @param string[] $checked URLs requested so far; updated in place.
   *
   * @return void
   */
  function loadRecursive($url, array &$found, array &$checked)
  {
      // URLs originate from remote markup, so escape them before printing into HTML.
      $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');

      echo "Parsing " . $safeUrl . " <br/>";
      $checked[] = $url;

      // Join base and href with exactly one slash. Plain concatenation turned an
      // href such as "page.html" into "http://hostpage.html".
      // NOTE(review): hrefs are resolved against the site root, not against the
      // directory of the current page - confirm this matches the site layout.
      $baseUrl = rtrim(getMainUrl(), '/');
      $counter = 0;
      foreach (findLinks(loadHTML($url)) as $href) {
          $link = $baseUrl . '/' . ltrim($href, '/');
          if ($link === $url) {
              // A page linking to itself is not a new discovery.
              continue;
          }

          $found[md5($link)] = $link;
          $counter++;
      }

      // NOTE(review): on PHP 7+ a by-value foreach iterates over a copy, so URLs
      // that loadRecursive1() appends to $found during this loop are recorded but
      // not visited here.
      foreach ($found as $link) {
          loadRecursive1($link, $found, $checked);
      }

      echo "Found " . $counter . " links on " . $safeUrl . "<hr/>";
  }
  74.  
  /**
   * Checks with get_headers() that a URL answers successfully with an HTML document.
   *
   * @param string $url Absolute URL to probe.
   *
   * @return bool True when the final response status is in the 200-399 range and
   *              its media type is text/html; false otherwise, including when no
   *              headers could be retrieved.
   */
  function urlIsOk($url)
  {
      // Deliberately best-effort: an unreachable host makes get_headers() emit a
      // warning and return false, which is simply reported as "not ok".
      $headers = @get_headers($url, 1);
      if (!$headers) {
          return false;
      }

      // In associative mode the status line of every hop is stored under an integer
      // key and repeated headers are collected into arrays, so after redirects the
      // last status line / last Content-Type value describe the final response.
      $statusLine = '';
      $contentType = null;
      foreach ($headers as $name => $value) {
          if (is_int($name)) {
              if (is_string($value)) {
                  $statusLine = $value;
              }
              continue;
          }
          // Header names are case-insensitive; servers may send "content-type".
          if (strcasecmp($name, 'Content-Type') === 0) {
              $contentType = is_array($value) ? end($value) : $value;
          }
      }

      if (!preg_match('~^HTTP/\S+\s+(\d{3})~', $statusLine, $matches)) {
          return false;
      }
      $statusCode = (int)$matches[1];
      // 400 itself is a client error and must be rejected as well.
      if ($statusCode < 200 || $statusCode >= 400) {
          return false;
      }

      if (!is_string($contentType)) {
          return false;
      }

      // Ignore parameters such as "; charset=utf-8" when comparing the media type.
      $parts = explode(';', $contentType, 2);
      $mediaType = strtolower(trim($parts[0]));

      return $mediaType === 'text/html';
  }
  95.  
  /**
   * Downloads the body of a page with cURL.
   *
   * @param string $url Absolute URL to download.
   *
   * @return string The response body, or an empty string when urlIsOk() rejects
   *                the URL or the transfer fails.
   */
  function loadHTML($url)
  {
      if (!urlIsOk($url)) {
          return '';
      }

      // NOTE(review): TLS host and peer verification are switched off here - confirm
      // this is intended before crawling anything over https.
      $options = [
          CURLOPT_URL => $url,
          CURLOPT_FOLLOWLOCATION => true,
          CURLOPT_RETURNTRANSFER => true,
          CURLOPT_SSL_VERIFYHOST => 0,
          CURLOPT_SSL_VERIFYPEER => 0,
      ];

      $handle = curl_init();
      foreach ($options as $option => $value) {
          curl_setopt($handle, $option, $value);
      }
      $body = curl_exec($handle);
      curl_close($handle);

      // curl_exec() returns false when the transfer fails; report that as "no content".
      return $body === false ? '' : (string)$body;
  }
  116.  
  /**
   * Extracts the unique relative link targets from an HTML document.
   *
   * Links carrying a URI scheme (http:, https:, mailto:, javascript:, ...) or a
   * protocol-relative "//host" prefix are skipped. Fragments are stripped, so
   * "/page.html#top" yields "/page.html" and a bare "#top" yields nothing.
   *
   * @param string $html Markup to scan; may be empty.
   *
   * @return string[] Relative hrefs in document order, without duplicates.
   */
  function findLinks($html)
  {
      // loadHTML() returns '' for unreachable pages, and DOMDocument::loadHTML()
      // rejects empty input (a ValueError on PHP 8 that "@" cannot suppress).
      if (trim((string)$html) === '') {
          return [];
      }

      // Real-world markup is rarely valid: collect parser errors internally
      // instead of emitting warnings, then restore the previous setting.
      $previous = libxml_use_internal_errors(true);
      $domDocument = new DOMDocument();
      $loaded = $domDocument->loadHTML($html);
      libxml_clear_errors();
      libxml_use_internal_errors($previous);
      if (!$loaded) {
          return [];
      }

      $xpath = new DOMXPath($domDocument);

      /**
       * @var DOMNodeList $elements
       */
      $elements = $xpath->query('//a[@href]'); // find all elements that have an href attribute
      $links = [];

      foreach ($elements as $element) {
          $href = trim($element->getAttribute('href'));

          // A fragment only addresses a position inside a page; keep the page part.
          $hashPosition = strpos($href, '#');
          if ($hashPosition !== false) {
              $href = substr($href, 0, $hashPosition);
          }
          if ($href === '') {
              continue;
          }

          // Test for a scheme or "//" prefix rather than for "http" anywhere in the
          // string, so that a path like "/docs/http-intro.html" is kept.
          if (preg_match('~^(?:[a-z][a-z0-9+.-]*:|//)~i', $href)) {
              continue;
          }

          // Keying by the href removes duplicates while preserving first-seen order.
          $links[$href] = $href;
      }

      return array_values($links);
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement