View difference between Paste ID: NMZsvTP2 and e0wY8crW
SHOW: | | - or go back to the newest paste.
1
<?php
// Crawler entry script: collects the same-site links reachable from the start
// page and dumps them.

// Report every error while the script is being developed.
error_reporting(-1);
ini_set('display_errors', '1');

// get_headers() uses the default stream context; switching it to HEAD means a
// URL can be checked without downloading the response body.
stream_context_set_default(
    [
        'http' => [
            'method' => 'HEAD',
        ],
    ]
);

$links = [];   // md5(url) => url for every link discovered
$checked = []; // URLs that have been fetched
loadRecursive(getMainUrl(), $links, $checked);

var_dump($links);

/**
 * Returns the base URL (scheme and host, no trailing slash) that crawled
 * links are prefixed with.
 *
 * @return string
 */
function getMainUrl()
{
    return 'http://theory.phphtml.net';
}

/**
 * Fetches a single page and records every site-relative link found on it.
 *
 * The links that are found are only recorded, not followed.
 *
 * @param string   $url     Absolute URL of the page to scan.
 * @param string[] $found   Collected links keyed by md5 of the URL; new links are added in place.
 * @param string[] $checked URLs fetched so far; $url is appended. Optional, so the function can
 *                          be called with either two or three arguments.
 * @return void
 */
function loadRecursive1($url, array &$found, array &$checked = [])
{
    // URLs come from crawled markup, so escape them before echoing into HTML.
    $printableUrl = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE);

    echo "Parsing " . $printableUrl . " <br/>";
    $checked[] = $url;

    $counter = 0;
    foreach (findLinks(loadHTML($url)) as $link) {
        // Always join with exactly one slash so that an href without a leading
        // slash is not glued onto the host name. NOTE(review): this resolves
        // every relative href against the site root, not the current page.
        $link = getMainUrl() . '/' . ltrim($link, '/');
        if ($link === $url) {
            continue; // a page linking to itself adds nothing
        }

        $found[md5($link)] = $link;
        $counter++;
    }

    echo "Found " . $counter . " links on " . $printableUrl . "<hr/>";
}

/**
 * Scans the start page, then scans every page linked from it one level deep.
 *
 * @param string   $url     Absolute URL of the start page.
 * @param string[] $found   Collected links keyed by md5 of the URL; new links are added in place.
 * @param string[] $checked URLs fetched so far; used to avoid fetching a page twice. Optional, so
 *                          the function can be called with either two or three arguments.
 * @return void
 */
function loadRecursive($url, array &$found, array &$checked = [])
{
    // URLs come from crawled markup, so escape them before echoing into HTML.
    $printableUrl = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE);

    echo "Parsing " . $printableUrl . " <br/>";
    $checked[] = $url;

    $counter = 0;
    foreach (findLinks(loadHTML($url)) as $link) {
        // Always join with exactly one slash so that an href without a leading
        // slash is not glued onto the host name. NOTE(review): this resolves
        // every relative href against the site root, not the current page.
        $link = getMainUrl() . '/' . ltrim($link, '/');
        if ($link === $url) {
            continue; // a page linking to itself adds nothing
        }

        $found[md5($link)] = $link;
        $counter++;
    }

    // loadRecursive1() adds to $found while the pages are being scanned, so
    // walk an explicit snapshot of the links known at this point.
    $pending = $found;
    foreach ($pending as $link) {
        if (in_array($link, $checked, true)) {
            continue; // already fetched
        }
        loadRecursive1($link, $found, $checked);
    }

    echo "Found " . $counter . " links on " . $printableUrl . "<hr/>";
}

/**
 * Checks whether a URL answers with a non-error status and an HTML body.
 *
 * @param string $url Absolute URL to probe.
 * @return bool True when the final response status is in the range 200-399
 *              and its Content-Type media type is text/html.
 */
function urlIsOk($url)
{
    // get_headers() raises a warning for unreachable hosts; the failure is
    // reported through the return value instead.
    $headers = @get_headers($url, 1);
    if (!$headers) {
        return false;
    }

    $statusCode = null;
    $contentType = null;
    foreach ($headers as $name => $value) {
        if (is_int($name)) {
            // Status lines sit under integer keys, one per redirect hop;
            // the last one describes the final response.
            if (is_string($value) && preg_match('~^HTTP/\S+\s+(\d{3})~', $value, $matches)) {
                $statusCode = (int)$matches[1];
            }
            continue;
        }

        if (strcasecmp($name, 'Content-Type') === 0) {
            // A header repeated across hops is returned as an array.
            $contentType = is_array($value) ? end($value) : $value;
        }
    }

    if ($statusCode === null || $statusCode < 200 || $statusCode >= 400) {
        return false;
    }
    if (!is_string($contentType)) {
        return false;
    }

    // Ignore parameters such as "; charset=utf-8" and letter case.
    $parts = explode(';', $contentType, 2);

    return strtolower(trim($parts[0])) === 'text/html';
}

/**
 * Downloads the body of a URL that passes urlIsOk().
 *
 * @param string $url Absolute URL to download.
 * @return string The response body, or an empty string when the URL is not
 *                acceptable or the transfer fails.
 */
function loadHTML($url)
{
    if (!urlIsOk($url)) {
        return '';
    }

    $curl = curl_init();
    if ($curl === false) {
        return '';
    }

    // TLS peer/host verification is left at cURL's secure defaults.
    curl_setopt_array($curl, [
        CURLOPT_URL            => $url,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_RETURNTRANSFER => true,
        // Keep one unresponsive host from stalling the whole crawl.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
    ]);

    $result = curl_exec($curl);
    curl_close($curl);

    return $result === false ? '' : (string)$result;
}

/**
 * Extracts the distinct relative links from an HTML document.
 *
 * Links with a scheme (http:, https:, mailto:, javascript:, ...) and
 * protocol-relative links (//host/...) are skipped. A fragment ("#...") is
 * stripped, so "/page#part" yields "/page" and a bare "#top" is dropped.
 *
 * @param string $html HTML markup; may be empty.
 * @return string[] Unique href values in document order, indexed from 0.
 */
function findLinks($html)
{
    // DOMDocument::loadHTML() rejects empty input, so answer that case directly.
    if (trim((string)$html) === '') {
        return [];
    }

    // Collect parser complaints about sloppy markup internally instead of
    // emitting warnings, then restore the previous libxml setting.
    $previousSetting = libxml_use_internal_errors(true);
    $domDocument = new DOMDocument();
    $loaded = $domDocument->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    if (!$loaded) {
        return [];
    }

    $xpath = new DOMXPath($domDocument);

    /**
     * @var DOMNodeList $elements
     */
    $elements = $xpath->query('//a[@href]'); // find all elements that have an href attribute
    $links = [];

    foreach ($elements as $element) {
        $href = trim($element->getAttribute('href'));

        // The fragment only addresses a position inside the page.
        $hashPosition = strpos($href, '#');
        if ($hashPosition !== false) {
            $href = substr($href, 0, $hashPosition);
        }
        if ($href === '') {
            continue;
        }

        // Test for a leading scheme or "//" rather than for "http" anywhere
        // in the value, so paths such as "/docs/http-intro" are kept.
        if (preg_match('~^(?:[a-z][a-z0-9+.-]*:|//)~i', $href)) {
            continue;
        }

        $links[] = $href;
    }

    // array_unique() keeps the original keys; reindex to a plain list.
    return array_values(array_unique($links));
}