SHOW:
|
|
- or go back to the newest paste.
<?php
// Entry script: crawl the site returned by getMainUrl() and dump every link found.

// Surface every notice/warning while developing the crawler.
error_reporting(-1);
ini_set('display_errors', '1');

// Stream-based HTTP calls that use the default context will send HEAD requests;
// urlIsOk() depends on this via get_headers() so that only headers are transferred.
stream_context_set_default(
    [
        'http' => [
            'method' => 'HEAD',
        ],
    ]
);

// Map of md5(link) => absolute link, filled in by loadRecursive().
$links = [];
// NOTE(review): left over from an earlier revision; nothing in this file reads it any more.
$checked = [];
loadRecursive(getMainUrl(), $links);

var_dump($links);
/**
 * Returns the root URL of the site being crawled.
 *
 * The value has no trailing slash; callers append link paths to it.
 *
 * @return string
 */
function getMainUrl()
{
    return 'http://theory.phphtml.net';
}
21 | ||
/**
 * Scans a single page and records every local link on it (no further descent).
 *
 * Prints progress markup ("Parsing ...", "Found N links on ...") as it goes.
 *
 * @param string $url   Absolute URL of the page to scan.
 * @param array  $found Map of md5(link) => absolute link; extended in place.
 *
 * @return void
 */
function loadRecursive1($url, array &$found)
{
    echo "Parsing " . $url . " <br/>";

    $base = rtrim(getMainUrl(), '/');
    $counter = 0;

    foreach (findLinks(loadHTML($url)) as $link) {
        // Join with exactly one slash so an href without a leading "/" is not
        // glued onto the host name. Relative hrefs are treated as root-relative,
        // which is an approximation of real URL resolution.
        $link = $base . '/' . ltrim($link, '/');
        if ($link === $url) {
            // A page linking to itself is not a new discovery.
            continue;
        }

        // Keyed by hash so the same link found on several pages is stored once.
        $found[md5($link)] = $link;
        $counter++;
    }

    echo "Found " . $counter . " links on " . $url . "<hr/>";
}
45 | ||
46 | ||
/**
 * Crawls a start page and then every page already recorded in $found.
 *
 * Despite the name, descent stops after two levels: links on the start page
 * are collected first, then each collected link is scanned once with
 * loadRecursive1(), which does not descend any further.
 *
 * Prints progress markup ("Parsing ...", "Found N links on ...") as it goes.
 *
 * @param string $url   Absolute URL of the start page.
 * @param array  $found Map of md5(link) => absolute link; extended in place.
 *
 * @return void
 */
function loadRecursive($url, array &$found)
{
    echo "Parsing " . $url . " <br/>";

    $base = rtrim(getMainUrl(), '/');
    $counter = 0;

    foreach (findLinks(loadHTML($url)) as $link) {
        // Join with exactly one slash so an href without a leading "/" is not
        // glued onto the host name. Relative hrefs are treated as root-relative,
        // which is an approximation of real URL resolution.
        $link = $base . '/' . ltrim($link, '/');
        if ($link === $url) {
            // The start page linking to itself is not a new discovery.
            continue;
        }

        // Keyed by hash so a repeated link is stored once.
        $found[md5($link)] = $link;
        $counter++;
    }

    // Iterate over a snapshot: loadRecursive1() appends to $found by reference,
    // and the pages it adds must not be visited by this loop, whatever the
    // running PHP version does when an array grows during foreach.
    $firstLevel = $found;
    foreach ($firstLevel as $link) {
        loadRecursive1($link, $found);
    }

    // $counter covers only the links found directly on the start page.
    echo "Found " . $counter . " links on " . $url . "<hr/>";
}
73 | ||
/**
 * Probes a URL's response headers and reports whether it looks like a
 * successfully served HTML page.
 *
 * The URL is accepted when the status of the final response is in the
 * 200-399 range and its Content-Type media type is text/html (parameters
 * such as "; charset=utf-8" are ignored).
 *
 * @param string $url Absolute URL to probe.
 *
 * @return bool True when the URL is worth downloading, false otherwise
 *              (including any failure to obtain headers).
 */
function urlIsOk($url)
{
    // Best-effort probe: a failed lookup just means "not OK", so the warning
    // get_headers() raises on connection errors is deliberately suppressed.
    $headers = @get_headers($url, 1);
    if (!$headers) {
        return false;
    }

    $statusLine = null;
    $contentType = null;
    foreach ($headers as $name => $value) {
        if (is_int($name)) {
            // Status lines are stored under numeric keys, one per redirect hop;
            // the last one belongs to the response that was finally served.
            $statusLine = $value;
        } elseif (strcasecmp($name, 'Content-Type') === 0) {
            // Header names are case-insensitive, and a header repeated across
            // hops is reported as an array; the last entry is the final one.
            $contentType = is_array($value) ? end($value) : $value;
        }
    }

    if (!is_string($statusLine) || !preg_match('~\d{3}~', $statusLine, $matches)) {
        return false;
    }

    $statusCode = (int)$matches[0];
    if ($statusCode < 200 || $statusCode >= 400) {
        return false;
    }

    if (!is_string($contentType)) {
        return false;
    }

    // Compare only the media type, not optional parameters like the charset.
    $parts = explode(';', $contentType, 2);

    return strtolower(trim($parts[0])) === 'text/html';
}
94 | ||
/**
 * Downloads the body of a URL.
 *
 * The URL is first vetted with urlIsOk(); anything that fails the check or
 * cannot be transferred yields an empty string instead of an error.
 *
 * @param string $url Absolute URL to download.
 *
 * @return string The response body, or '' when the page could not be fetched.
 */
function loadHTML($url)
{
    if (!urlIsOk($url)) {
        return '';
    }

    $curl = curl_init();
    if ($curl === false) {
        return '';
    }

    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    // Bound each transfer so a single stalled host cannot hang the whole crawl.
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($curl, CURLOPT_TIMEOUT, 30);
    // NOTE(review): TLS certificate and host verification are switched off here,
    // which accepts any certificate. Kept as in the original; enable both
    // before pointing this at a site where authenticity matters.
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 0);

    $result = curl_exec($curl);
    curl_close($curl);

    // curl_exec() returns false on a failed transfer.
    return $result === false ? '' : (string)$result;
}
115 | ||
/**
 * Extracts the site-local link targets from an HTML document.
 *
 * Skipped: empty hrefs, hrefs with a URL scheme (http:, https:, mailto:,
 * javascript:, ...), protocol-relative hrefs (//host/...) and pure in-page
 * anchors (#section). A fragment on a link to another page is stripped, so
 * "/page#top" is reported as "/page".
 *
 * @param string $html Raw HTML markup; may be empty.
 *
 * @return array List of unique href values in document order.
 */
function findLinks($html)
{
    // DOMDocument::loadHTML() rejects an empty source (a warning on older PHP,
    // a ValueError on newer versions), and loadHTML() in this file returns ''
    // for every page it cannot fetch.
    if (trim($html) === '') {
        return [];
    }

    // Collect parser complaints about real-world markup internally instead of
    // emitting them as warnings, then restore the caller's setting.
    $previousSetting = libxml_use_internal_errors(true);
    $domDocument = new DOMDocument();
    $domDocument->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXPath($domDocument);

    /**
     * @var DOMNodeList $elements
     */
    $elements = $xpath->query('//a[@href]'); // find all elements that have an href attribute
    $links = [];

    foreach ($elements as $element) {
        $href = trim($element->getAttribute('href'));

        // Only a leading scheme or "//" marks an off-site or non-page link; a
        // local path that merely contains "http" somewhere must be kept.
        $hasScheme = preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href) === 1;
        $isProtocolRelative = strpos($href, '//') === 0;
        if ($hasScheme || $isProtocolRelative) {
            continue;
        }

        // Drop the fragment: it addresses a position inside a page, not a page.
        $parts = explode('#', $href, 2);
        $href = $parts[0];
        if ($href === '') {
            // Empty href or a pure in-page anchor.
            continue;
        }

        $links[] = $href;
    }

    // array_unique() keeps the original keys; reindex to return a plain list.
    return array_values(array_unique($links));
}