Advertisement
Guest User

try1

a guest
Feb 23rd, 2017
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.55 KB | None | 0 0
  1. <?php
  2.  
  // Surface every error, warning and notice while the crawler runs.
  error_reporting(-1);
  ini_set('display_errors', true);

  // Make HEAD the default method for PHP's HTTP stream wrapper, so the
  // get_headers() probe in urlIsOk() does not download response bodies.
  stream_context_set_default([
      'http' => [
          'method' => 'HEAD',
      ],
  ]);

  // Crawl the site starting from its main URL, then dump every link found.
  // $foundLinks is filled as md5(url) => url; $visitedUrls lists the pages
  // that have already been processed.
  $foundLinks = [];
  $visitedUrls = [];
  loadRecursive(getMainUrl(), $foundLinks, $visitedUrls);
  var_dump($foundLinks);
  17.  
  18.  
  /**
   * Returns the root URL of the site to crawl.
   *
   * The value ends with a slash; relative links found on pages are appended
   * to it by loadRecursive().
   *
   * @return string
   */
  function getMainUrl()
  {
      // Site URL.
      $siteRoot = 'http://newsite/';

      return $siteRoot;
  }
  23.  
  24.  
  /**
   * Crawls $url and, recursively, every relative link found on it.
   *
   * Each discovered link is made absolute against getMainUrl() and stored in
   * $found under the md5 of the absolute URL, so a link is recorded only once.
   * Pages that have already been processed are remembered in $checked, which
   * stops the recursion from looping on cyclic links.
   *
   * @param string $url     Absolute URL of the page to crawl.
   * @param array  $found   In/out: map of md5(url) => url for discovered links.
   * @param array  $checked In/out: list of URLs that were already crawled.
   *
   * @return void
   */
  function loadRecursive($url, array &$found, array &$checked)
  {
      // Strict comparison: URLs are strings and must match exactly.
      if (in_array($url, $checked, true)) {
          return;
      }
      $checked[] = $url;

      // Normalise the base so that it ends with exactly one slash.
      $baseUrl = rtrim(getMainUrl(), '/') . '/';

      foreach (findLinks(loadHTML($url)) as $link) {
          // Root-relative hrefs ("/about") would otherwise yield
          // "http://host//about" and be counted separately from "about".
          $link = $baseUrl . ltrim($link, '/');
          if ($link === $url) {
              // A page linking to itself is not a new discovery.
              continue;
          }

          $found[md5($link)] = $link;
          loadRecursive($link, $found, $checked);
      }
  }
  47.  
  /**
   * Checks whether $url answers with a non-error status and an HTML body.
   *
   * The response headers are fetched with get_headers(). The URL is accepted
   * when the status code of the first response is in the range 200-399 and
   * the (last) Content-Type header has the media type "text/html", regardless
   * of parameters such as "; charset=UTF-8".
   *
   * @param string $url URL to probe.
   *
   * @return bool True when the URL looks like a loadable HTML page.
   */
  function urlIsOk($url)
  {
      // get_headers() emits a warning for unreachable or malformed URLs;
      // failure is reported through the false return value instead.
      $headers = @get_headers($url, 1);
      if (!$headers || !isset($headers[0])) {
          return false;
      }

      // The status line looks like "HTTP/1.1 200 OK"; capture the code.
      if (!preg_match('~\s(\d{3})(?:\s|$)~', (string)$headers[0], $matches)) {
          return false;
      }
      $statusCode = (int)$matches[1];
      if ($statusCode < 200 || $statusCode >= 400) {
          return false;
      }

      // Header names are case-insensitive, so normalise the keys first.
      $headers = array_change_key_case($headers, CASE_LOWER);
      if (!isset($headers['content-type'])) {
          return false;
      }

      // When redirects were followed the header holds one value per
      // response; the last one belongs to the final page.
      $contentType = $headers['content-type'];
      if (is_array($contentType)) {
          $contentType = end($contentType);
      }

      // Drop parameters ("; charset=...") before comparing the media type.
      $parts = explode(';', (string)$contentType, 2);
      $mediaType = strtolower(trim($parts[0]));

      return $mediaType === 'text/html';
  }
  68.  
  /**
   * Downloads the body of $url with cURL.
   *
   * The URL is first probed with urlIsOk(); when that check fails, or when
   * the transfer itself fails, an empty string is returned. Redirects are
   * followed, and TLS host and peer verification are switched off.
   *
   * @param string $url URL of the page to download.
   *
   * @return string Response body, or '' when the page could not be loaded.
   */
  function loadHTML($url)
  {
      if (!urlIsOk($url)) {
          return '';
      }

      $handle = curl_init();
      curl_setopt_array($handle, [
          CURLOPT_URL => $url,
          CURLOPT_FOLLOWLOCATION => true,
          // Return the body from curl_exec() instead of printing it.
          CURLOPT_RETURNTRANSFER => true,
          // Certificate checks are disabled.
          CURLOPT_SSL_VERIFYHOST => 0,
          CURLOPT_SSL_VERIFYPEER => 0,
      ]);
      $body = curl_exec($handle);
      curl_close($handle);

      // curl_exec() yields false on a failed transfer (broken URL).
      return $body === false ? '' : (string)$body;
  }
  89.  
  /**
   * Extracts the site-relative links from an HTML document.
   *
   * Every <a> element with an href attribute is inspected. A link is skipped
   * when its href is empty, carries a URI scheme (http:, https:, mailto:,
   * javascript:, ...), is protocol-relative ("//host/path"), or consists of
   * a fragment only ("#top"). A fragment at the end of a relative link is
   * stripped, so "page.html#intro" is reported as "page.html".
   *
   * @param string $html HTML markup; may be empty.
   *
   * @return array List of unique relative hrefs, in document order.
   */
  function findLinks($html)
  {
      // DOMDocument::loadHTML() throws a ValueError on empty input in PHP 8,
      // and loadHTML() hands back '' for every page it could not fetch.
      if (trim((string)$html) === '') {
          return [];
      }

      // Collect parser warnings about sloppy markup internally rather than
      // emitting them, then restore the previous libxml setting.
      $previousSetting = libxml_use_internal_errors(true);
      $domDocument = new DOMDocument();
      $loaded = $domDocument->loadHTML($html);
      libxml_clear_errors();
      libxml_use_internal_errors($previousSetting);
      if (!$loaded) {
          return [];
      }

      $xpath = new DOMXPath($domDocument);

      /**
       * @var DOMNodeList $elements
       */
      $elements = $xpath->query('//a[@href]'); // every element that has an href attribute
      $links = [];

      foreach ($elements as $element) {
          $href = trim($element->getAttribute('href'));

          // Cut off the fragment; "#top" alone leaves nothing to crawl.
          $fragmentPos = strpos($href, '#');
          if ($fragmentPos !== false) {
              $href = substr($href, 0, $fragmentPos);
          }
          if ($href === '') {
              continue;
          }

          // A leading "scheme:" or "//" marks a link that is not
          // relative to this site; "http" elsewhere in a path is fine.
          $hasScheme = preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href) === 1;
          $isProtocolRelative = strncmp($href, '//', 2) === 0;
          if ($hasScheme || $isProtocolRelative) {
              continue;
          }

          $links[] = $href;
      }

      // array_unique() keeps the original keys; reindex to a plain list.
      return array_values(array_unique($links));
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement