Advertisement
Guest User

Untitled

a guest
Apr 30th, 2016
104
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.87 KB | None | 0 0
  1. <?php
  2. set_time_limit(0); // 0 removes PHP's execution time limit so a long-running crawl is not aborted mid-run
  /**
   * Recursive, depth-limited web crawler restricted to the host of its start URL.
   *
   * For every page visited it prints one status line (running count, HTTP code,
   * total request time, depth and URL), then follows each anchor found on the
   * page until the depth budget is exhausted. Optional HTTP authentication and
   * substring-based path exclusions are supported.
   */
  class crawler
  {
      /** @var string URL the crawl starts from. */
      protected $_url;
      /** @var int Maximum number of link levels to follow from the start URL. */
      protected $_depth;
      /** @var string Host component of the start URL; the crawl never leaves it. */
      protected $_host;
      /** @var bool Whether HTTP authentication credentials are sent with requests. */
      protected $_useHttpAuth = false;
      /** @var string|null User name for HTTP authentication. */
      protected $_user;
      /** @var string|null Password for HTTP authentication. */
      protected $_pass;
      /** @var array Map of already visited URLs (URL => true). */
      protected $_seen = array();
      /** @var array List of substrings; URLs containing any of them are skipped. */
      protected $_filter = array();

      /**
       * @param string $url   Absolute start URL (must contain a host).
       * @param int    $depth Maximum crawl depth.
       *
       * @throws InvalidArgumentException When no host can be extracted from $url.
       */
      public function __construct($url, $depth = 5)
      {
          $host = parse_url($url, PHP_URL_HOST);
          if (!is_string($host) || $host === '') {
              // Without a host the same-host restriction cannot be enforced.
              throw new InvalidArgumentException('Start URL must be absolute and contain a host: ' . $url);
          }

          $this->_url = $url;
          $this->_depth = $depth;
          $this->_host = $host;
      }

      /**
       * Extracts every anchor from a fetched page and crawls its target.
       *
       * @param string|false $content Response body as returned by _getContent().
       * @param string       $url     URL the content was fetched from (base for relative links).
       * @param int          $depth   Remaining depth budget of the current page.
       */
      protected function _processAnchors($content, $url, $depth)
      {
          // A failed request yields false/empty content; loadHTML() rejects an
          // empty source (ValueError on PHP 8), so there is nothing to parse.
          if (!is_string($content) || trim($content) === '') {
              return;
          }

          $dom = new DOMDocument('1.0');
          // Real-world markup is rarely valid; collect libxml errors instead of
          // emitting warnings, then restore the previous setting.
          $previous = libxml_use_internal_errors(true);
          $dom->loadHTML($content);
          libxml_clear_errors();
          libxml_use_internal_errors($previous);

          foreach ($dom->getElementsByTagName('a') as $element) {
              $target = $this->_resolveUrl($element->getAttribute('href'), $url);
              if ($target === null) {
                  continue;
              }
              // crawl_page() itself rejects targets outside the start host.
              $this->crawl_page($target, $depth - 1);
          }
      }

      /**
       * Turns an href attribute into an absolute http(s) URL.
       *
       * @param string $href    Raw href attribute value.
       * @param string $baseUrl Absolute URL of the page containing the link.
       *
       * @return string|null Absolute URL without fragment, or null when the link
       *                     is empty, fragment-only, or uses a non-http(s) scheme.
       */
      private function _resolveUrl($href, $baseUrl)
      {
          $href = trim($href);

          // The fragment never changes the fetched resource; dropping it keeps
          // the same page from being visited once per anchor name.
          $hashPos = strpos($href, '#');
          if ($hashPos !== false) {
              $href = (string) substr($href, 0, $hashPos);
          }
          if ($href === '') {
              return null;
          }

          // Link that carries its own scheme: keep http(s), skip mailto:, javascript:, ...
          if (preg_match('~^([a-z][a-z0-9+.\-]*):~i', $href, $match)) {
              $scheme = strtolower($match[1]);
              return ($scheme === 'http' || $scheme === 'https') ? $href : null;
          }

          $base = parse_url($baseUrl);
          if ($base === false || !isset($base['scheme'], $base['host'])) {
              return null;
          }

          // Protocol-relative link: only the scheme is inherited from the base.
          if (strpos($href, '//') === 0) {
              return $base['scheme'] . ':' . $href;
          }

          $origin = $base['scheme'] . '://';
          if (isset($base['user']) && isset($base['pass'])) {
              $origin .= $base['user'] . ':' . $base['pass'] . '@';
          }
          $origin .= $base['host'];
          if (isset($base['port'])) {
              $origin .= ':' . $base['port'];
          }

          $basePath = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';

          // Query-only link: same path as the base document, new query string.
          if ($href[0] === '?') {
              return $origin . $basePath . $href;
          }

          // Dot segments are only meaningful in the path, so set the query aside.
          $query = '';
          $queryPos = strpos($href, '?');
          if ($queryPos !== false) {
              $query = substr($href, $queryPos);
              $href = substr($href, 0, $queryPos);
          }

          if ($href[0] === '/') {
              $path = $href;
          } else {
              // Relative to the directory of the base document.
              $path = substr($basePath, 0, strrpos($basePath, '/') + 1) . $href;
          }

          return $origin . $this->_removeDotSegments($path) . $query;
      }

      /**
       * Collapses "." and ".." segments of an absolute path.
       *
       * @param string $path Path starting with "/".
       *
       * @return string Normalised path.
       */
      private function _removeDotSegments($path)
      {
          $segments = explode('/', $path);
          $last = count($segments) - 1;
          $output = array();

          foreach ($segments as $index => $segment) {
              if ($segment === '.' || $segment === '..') {
                  // Never pop the leading empty segment that represents the root.
                  if ($segment === '..' && count($output) > 1) {
                      array_pop($output);
                  }
                  // A trailing dot segment refers to a directory: keep the slash.
                  if ($index === $last) {
                      $output[] = '';
                  }
                  continue;
              }
              $output[] = $segment;
          }

          return implode('/', $output);
      }

      /**
       * Fetches a URL with cURL.
       *
       * @param string $url URL to request.
       *
       * @return array Tuple of (response body or false on failure, HTTP status code, total time).
       */
      protected function _getContent($url)
      {
          $handle = curl_init($url);
          if ($handle === false) {
              return array(false, 0, 0);
          }

          if ($this->_useHttpAuth) {
              curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
              curl_setopt($handle, CURLOPT_USERPWD, $this->_user . ":" . $this->_pass);
          }
          // Redirects are deliberately not followed (CURLOPT_FOLLOWLOCATION):
          // following 302 responses caused problems with authentication.

          // Return the body instead of printing it.
          curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);

          $response = curl_exec($handle);
          $time = curl_getinfo($handle, CURLINFO_TOTAL_TIME);
          $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);

          curl_close($handle);

          return array($response, $httpCode, $time);
      }

      /**
       * Prints one HTML status line for a crawled page and flushes it to the client.
       *
       * @param string $url      Crawled URL.
       * @param int    $depth    Remaining depth budget when the page was fetched.
       * @param int    $httpcode HTTP status code of the response.
       * @param float  $time     Total request time reported by cURL.
       */
      protected function _printResult($url, $depth, $httpcode, $time)
      {
          // ob_end_flush() raises a notice when no output buffer is active.
          if (ob_get_level() > 0) {
              ob_end_flush();
          }

          $currentDepth = $this->_depth - $depth;
          $count = count($this->_seen);
          // The URL comes from crawled pages, so escape it before emitting HTML.
          $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
          echo "N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$safeUrl <br>";

          ob_start();
          flush();
      }

      /**
       * Decides whether a URL should be crawled.
       *
       * @param string $url   Candidate URL.
       * @param int    $depth Remaining depth budget for the candidate.
       *
       * @return bool False when the depth budget is used up, the URL was already
       *              visited, its host is not the start host (or a subdomain of
       *              it), or it contains an excluded path fragment.
       */
      protected function isValid($url, $depth)
      {
          if ($depth <= 0 || isset($this->_seen[$url])) {
              return false;
          }

          // Compare the parsed host rather than searching the whole URL, so a
          // foreign URL that merely mentions the start host in its path or
          // query string is not mistaken for an in-scope one.
          $host = parse_url($url, PHP_URL_HOST);
          if (!is_string($host) || !$this->_isCrawlHost($host)) {
              return false;
          }

          foreach ($this->_filter as $excludePath) {
              if (strpos($url, $excludePath) !== false) {
                  return false;
              }
          }

          return true;
      }

      /**
       * Tells whether a host is the start host or one of its subdomains.
       *
       * @param string $host Host name to test.
       *
       * @return bool
       */
      private function _isCrawlHost($host)
      {
          $host = strtolower($host);
          $own = strtolower($this->_host);
          if ($host === $own) {
              return true;
          }

          $suffix = '.' . $own;

          return substr($host, -strlen($suffix)) === $suffix;
      }

      /**
       * Crawls one page: records it, fetches it, reports it and follows its links.
       *
       * @param string $url   URL to crawl.
       * @param int    $depth Remaining depth budget.
       */
      public function crawl_page($url, $depth)
      {
          if (!$this->isValid($url, $depth)) {
              return;
          }

          // Mark as seen before fetching so links back to this page are ignored.
          $this->_seen[$url] = true;
          list($content, $httpcode, $time) = $this->_getContent($url);
          $this->_printResult($url, $depth, $httpcode, $time);
          $this->_processAnchors($content, $url, $depth);
      }

      /**
       * Enables HTTP authentication for all subsequent requests.
       *
       * @param string $user User name.
       * @param string $pass Password.
       */
      public function setHttpAuth($user, $pass)
      {
          $this->_useHttpAuth = true;
          $this->_user = $user;
          $this->_pass = $pass;
      }

      /**
       * Excludes every URL containing the given substring from the crawl.
       *
       * @param string $path Substring to exclude.
       */
      public function addFilterPath($path)
      {
          $this->_filter[] = $path;
      }

      /**
       * Starts the crawl at the configured URL with the full depth budget.
       */
      public function run()
      {
          $this->crawl_page($this->_url, $this->_depth);
      }
  }
  133.  
  // Example run: crawl the start site up to $depth link levels, sending the
  // HTTP credentials below with every request.
  $username = 'YOURUSER';
  $password = 'YOURPASS';
  $depth = 30;
  $startURL = 'http://all-free-download.com/';

  $crawler = new crawler($startURL, $depth);
  // Any URL containing this path fragment is skipped.
  $crawler->addFilterPath('customer/account/login/referer');
  $crawler->setHttpAuth($username, $password);
  $crawler->run();
  144. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement