Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
// Report every class of error and print it, so crawl problems are visible.
error_reporting(-1);
ini_set('display_errors', true);

// Make HEAD the default method for PHP's HTTP stream wrapper.
// NOTE(review): presumably so that get_headers() in urlIsOk() does not
// download page bodies; cURL requests are not affected by this context.
stream_context_set_default([
    'http' => [
        'method' => 'HEAD',
    ],
]);

// $links receives every discovered URL (keyed by md5 of the URL);
// $checked records the URLs that were already crawled.
$links = [];
$checked = [];

// Crawl from the site root, then dump everything that was found.
loadRecursive(getMainUrl(), $links, $checked);
var_dump($links);
/**
 * Returns the root URL of the site being crawled.
 *
 * The value ends with a slash; loadRecursive() prepends it to every
 * relative link it discovers.
 *
 * @return string
 */
function getMainUrl()
{
    // Site URL.
    $siteUrl = 'http://newsite/';

    return $siteUrl;
}
/**
 * Crawls a page and, recursively, every site-relative link found on it.
 *
 * Each link returned by findLinks() is joined to the site root from
 * getMainUrl(), stored in $found under the md5 hash of the resulting
 * absolute URL, and then crawled in turn.
 *
 * @param string $url     Absolute URL of the page to crawl.
 * @param array  $found   Output map (md5 of URL => URL) of discovered links; filled in place.
 * @param array  $checked List of URLs already crawled; appended to in place
 *                        so that no page is fetched twice.
 *
 * @return void
 */
function loadRecursive($url, array &$found, array &$checked)
{
    // Strict comparison: URLs are strings and must match exactly.
    if (in_array($url, $checked, true)) {
        return;
    }
    $checked[] = $url;

    // Normalise the root to end in exactly one slash (hoisted out of the loop).
    $siteRoot = rtrim(getMainUrl(), '/') . '/';

    foreach (findLinks(loadHTML($url)) as $relativeLink) {
        // Drop leading slashes so "/about" becomes "<root>about" rather than
        // "<root>/about" with a doubled slash.
        $link = $siteRoot . ltrim($relativeLink, '/');

        // A page linking to itself is neither recorded nor re-crawled.
        if ($link === $url) {
            continue;
        }

        $found[md5($link)] = $link;
        loadRecursive($link, $found, $checked);
    }
}
/**
 * Checks, from the response headers alone, whether a URL points to a
 * reachable HTML page.
 *
 * A URL is accepted when the final response status is in the range 200-399
 * and its Content-Type is text/html (any parameters such as
 * "; charset=UTF-8" are ignored).
 *
 * @param string $url Absolute URL to probe.
 *
 * @return bool True if the URL answered with an HTML page, false otherwise
 *              (including when the headers could not be fetched at all).
 */
function urlIsOk($url)
{
    // "@": unreachable or malformed URLs are expected while crawling; the
    // failure is reported through the false return value instead of a warning.
    $headers = @get_headers($url, true);
    if (!$headers) {
        return false;
    }

    // In associative mode the status lines sit under integer keys, one per
    // response when redirects were followed. Keep the last one, i.e. the
    // status of the final response.
    $statusCode = null;
    foreach ($headers as $key => $value) {
        if (is_int($key) && is_string($value) && preg_match('~^HTTP/\S+\s+(\d{3})~', $value, $matches)) {
            $statusCode = (int)$matches[1];
        }
    }
    // 4xx and 5xx (including 400 itself) are failures.
    if ($statusCode === null || $statusCode < 200 || $statusCode >= 400) {
        return false;
    }

    // Header names are case-insensitive, so normalise before the lookup.
    $headers = array_change_key_case($headers, CASE_LOWER);
    if (!isset($headers['content-type'])) {
        return false;
    }

    // A header repeated across redirects arrives as an array; the last
    // element belongs to the final response.
    $contentType = $headers['content-type'];
    if (is_array($contentType)) {
        $contentType = end($contentType);
    }

    // Accept "text/html" with or without parameters, in any letter case.
    return preg_match('~^\s*text/html\s*(?:;|$)~i', (string)$contentType) === 1;
}
/**
 * Downloads the body of a page with cURL.
 *
 * The URL is first vetted with urlIsOk(); rejected URLs and failed
 * transfers both yield an empty string.
 *
 * @param string $url Absolute URL of the page to download.
 *
 * @return string The response body, or '' when the page could not be loaded.
 */
function loadHTML($url)
{
    // Skip the download when the preliminary header check rejects the URL.
    if (!urlIsOk($url)) {
        return '';
    }

    $handle = curl_init();
    curl_setopt_array($handle, [
        CURLOPT_URL => $url,
        CURLOPT_FOLLOWLOCATION => true,
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => true,
        // NOTE(review): TLS host and peer verification are switched off.
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_SSL_VERIFYPEER => 0,
    ]);

    $body = curl_exec($handle);
    curl_close($handle);

    // curl_exec() yields false on a failed transfer (broken URL).
    return $body === false ? '' : (string)$body;
}
/**
 * Extracts the site-relative links from an HTML document.
 *
 * Every <a> element with an href attribute is inspected. Fragments
 * ("#...") are stripped, and links that are empty afterwards, carry a URL
 * scheme (http:, https:, mailto:, javascript:, ...) or are protocol-relative
 * ("//host/...") are skipped.
 *
 * @param string $html HTML markup; may be empty.
 *
 * @return array List of unique relative hrefs in document order.
 */
function findLinks($html)
{
    // DOMDocument::loadHTML() rejects an empty source (a ValueError on PHP 8
    // that "@" cannot suppress), and loadHTML() in this file returns '' for
    // pages it could not fetch.
    if (!is_string($html) || trim($html) === '') {
        return [];
    }

    // Collect markup errors internally instead of emitting warnings, then
    // restore whatever libxml error mode was active before.
    $previousMode = libxml_use_internal_errors(true);
    $domDocument = new DOMDocument();
    $loaded = $domDocument->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);
    if (!$loaded) {
        return [];
    }

    $xpath = new DOMXPath($domDocument);
    /**
     * @var DOMNodeList $elements
     */
    $elements = $xpath->query('//a[@href]'); // find all elements that have an href attribute

    $links = [];
    foreach ($elements as $element) {
        $href = trim($element->getAttribute('href'));

        // Drop the fragment: "/page#section" refers to the page "/page",
        // and a bare "#top" becomes empty and is skipped below.
        $hashPosition = strpos($href, '#');
        if ($hashPosition !== false) {
            $href = substr($href, 0, $hashPosition);
        }
        if ($href === '') {
            continue;
        }

        // Skip anything that starts with a URL scheme or is protocol-relative.
        // Testing only the start keeps paths that merely contain "http".
        if (preg_match('~^(?:[a-z][a-z0-9+.\-]*:|//)~i', $href)) {
            continue;
        }

        $links[] = $href;
    }

    // array_unique() preserves keys; reindex so callers get a plain list.
    return array_values(array_unique($links));
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement