Advertisement
opsftw

Web Crawler - with email scraping and proxy support

Aug 11th, 2013
531
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 7.99 KB | None | 0 0
  1. <?php
  2. class crawler
  3. {
  4.     /* * * * * * * * *\
  5.      * Author: Tux.
  6.      *
  7.      * Usage
  8.      * -----------
  9.      * $foo = new crawler('FULL_URL_HERE','BASE_URL',DPETH,GET_EMAILS,STAY_ON_SAME_DOMAIN);
  10.      *
  11.      * EXAMPLE:
  12.      * $foo = new crawler('http://bostonherald.com/about/contact','bostonherald.com',2,true,true);
  13.      *
  14.      * TO EXECUTE:
  15.      * $foo->init()
  16.      *
  17.      * OTHER FUNCTIONS:
  18.      * ---------------
  19.      *
  20.      *  SET A PROXY:
  21.      * $foo->set_proxy('PROXY_IP','PORT'); //if you want a proxy (optional)
  22.      *
  23.      * CHANGE URL WITHOUT CREATING A NER OBJECT:
  24.      * $src = $foo->getSource("URL_HERE");
  25.      *
  26.      * GET EMAIL LIST IF YOU SET EMAIL SCRAPING TO TRUE:
  27.      * # $foo->parseHTML($src,'email'));
  28.      *
  29.      * DUMP ANY ERRORS:
  30.      * $foo->getErrors());
  31.      *
  32.      * QUICKLY PARSE A WEBPAGE FOR URLS AND RETURN THEM:
  33.      * crawler::parseHTML($html);
  34.      */
  35.     # set the variables
  36.    private $url,$errors = array(),$proxy = array(0),$emails;
  37.     public  $userAgent,$sameDomain,$depth,$type;
  38.    
  39.     #iniciate the class (requires url)
  40.    public function __construct($url,$domain,$depth=1,$emails=0,$sameDomain=0,$type='url',$userAgent='Googlebot/2.1 (http://www.googlebot.com/bot.html)') {
  41.         $this->setUrl($url);
  42.         $this->emails     = $emails;
  43.         $this->type       = $type;
  44.         $this->userAgent  = $userAgent;
  45.         $this->sameDomain = $sameDomain;
  46.         $this->depth      = $depth;
  47.         $this->domain     = $domain;
  48.     }
  49.    
  50.     # # # # # # # # # # # # # # # # # # #
  51.    # status and information displaying #
  52.    # # # # # # # # # # # # # # # # # # #
  53.    
  54.     public function getErrors() {
  55.         return $this->errors;
  56.     }
  57.    
  58.     # # # # # # # # # # # # # # #
  59.    # set parrameter functions  #
  60.    # # # # # # # # # # # # # # #
  61.    
  62.     # set url or set error
  63.    private function setUrl($url) {
  64.         ( ( $this->is_url($url) ) ? ( ( $this->url_exists($url) ) ? $this->url = $url : $this->errors['url'] .= "error: url not reachable.\n" ) : $this->errors['url'] .= "error: malformed url.\n" );
  65.     }
  66.    
  67.     public function set_proxy($ip,$port) {
  68.         if($this->check_proxy($ip,$port)) {
  69.             $this->proxy[0] = 1;
  70.             $this->proxy[1] = "$ip:$port";
  71.         } else {
  72.             $this->errors['proxy'] .= "Could not connect to given proxy.\n";
  73.         }
  74.     }
  75.    
  76.     private function check_proxy($ip,$port) {
  77.         $ch = curl_init('http://api.proxyipchecker.com/pchk.php');
  78.         curl_setopt($ch, CURLOPT_POST, 1);
  79.         curl_setopt($ch, CURLOPT_POSTFIELDS,'ip='.$ip.'&port='.$port);
  80.         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
  81.         curl_setopt($ch, CURLOPT_HEADER, 0);
  82.         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
  83.         list($res_time, $speed, $country, $type) = explode(';', curl_exec($ch));
  84.         return ( ($country) ? true : false );
  85.     }
  86.    
  87.     # # # # # # # # #
  88.    # url functions #
  89.    # # # # # # # # #
  90.    
  91.     #verrify url syntax
  92.    private function is_url($url) {
  93.         return filter_var($url, FILTER_VALIDATE_URL);
  94.     }
  95.    
  96.     # verrify server can access the url
  97.    private function url_exists($url){
  98.         $resURL = curl_init();
  99.         curl_setopt($resURL, CURLOPT_URL, $url);
  100.         curl_setopt($resURL, CURLOPT_BINARYTRANSFER, 1);
  101.         curl_setopt($resURL, CURLOPT_HEADERFUNCTION, 'curlHeaderCallback');
  102.         curl_setopt($resURL, CURLOPT_FAILONERROR, 1);
  103.         curl_setopt($resURL, CURLOPT_USERAGENT,$this->userAgent);
  104.         curl_setopt($resUR, CURLOPT_TIMEOUT, 5);
  105.         curl_exec ($resURL);
  106.         $intReturnCode = curl_getinfo($resURL, CURLINFO_HTTP_CODE);
  107.         curl_close ($resURL);
  108.         return ( ($intReturnCode != 200 && $intReturnCode != 302 && $intReturnCode != 304) ? 0 : 1 );
  109.     }
  110.    
  111.     # # # # # # # # # # # # # # # # # #
  112.    # crawling and parsing functions  #
  113.    # # # # # # # # # # # # # # # # # #
  114.    
  115.     # fetch a pages source
  116.    public function getSource($url='',$curlMaxExecTime=5) {
  117.         $url = ( ($url=='') ? (($this->url=='') ? 'bad' : $this->url ) : $url );
  118.         if(is_numeric($curlMaxExecTime) && $this->is_url($url) && $this->url_exists($url) ) {
  119.             $ch = curl_init();
  120.             if($this->proxy[0]) {
  121.                 curl_setopt($ch, CURLOPT_PROXY, $this->proxy[1]);
  122.             }
  123.             curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
  124.             curl_setopt($ch, CURLOPT_URL,$url);
  125.             curl_setopt($ch, CURLOPT_FAILONERROR, true);
  126.             curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
  127.             curl_setopt($ch, CURLOPT_AUTOREFERER, true);
  128.             curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
  129.             curl_setopt($ch, CURLOPT_TIMEOUT, $curlMaxExecTime);
  130.             $html = curl_exec($ch);
  131.             if(!$html) {
  132.                 $this->errors['src'][$url] = "\$could not fetch source.\n";
  133.             } else {
  134.                 return $html;
  135.             }
  136.         } else {
  137.             if(!is_numeric($curlMaxExecTime)) $this->errors['src']['settings'] .= "\$curlMaxExecTime must be numeric.\n";
  138.             if(!$this->is_url($url) || !$this->url_exists($url)) $this->errors['src']['url'] .= "\$url must be a valed url.\n";
  139.         }
  140.     }
  141.    
  142.     # fetch URLs from html and return array
  143.    public function parseHTML($html,$data='url') {
  144.         if($data == 'email') {
  145.             preg_match_all('/[a-z0-9_\-\+]+@[a-z0-9\-]+\.([a-z]{2,3})(?:\.[a-z]{2})?/i', $html, $matches);
  146.             return $matches[0];
  147.         } elseif($data == 'google'){
  148.             preg_match_all('/a href="([^"]+)" class=l.+?>.+?<\/a>/', $html, $matches);
  149.             return $matches[0];
  150.         } else {
  151.             # url pattern
  152.            $dom = new DOMDocument;
  153.             $dom->loadHTML($html);
  154.             $links = $dom->getElementsByTagName('a');
  155.             $ret = array();
  156.             foreach ($links as $link){
  157.                 array_push($ret,$link->getAttribute('href'));
  158.             }
  159.             return $ret;
  160.         }
  161.     }
  162.     public function init() {
  163.         $depth      = $this->depth-1;
  164.         $url        = $this->url;
  165.         $domain     = $this->domain;
  166.         $sameDomain = $this->sameDomain;
  167.         $i          = 0;
  168.         $urls       = array($url);
  169.         $data       = array($url);
  170.         $elist      = array();
  171.         while($i<=$depth) {
  172.             $tmp = array();
  173.             foreach($urls as $u) {
  174.                 array_push($data,$u);
  175.                 $html       = $this->getSource($u);
  176.                 $xyz       = $this->parseHTML($html,$this->type);
  177.                 if($this->emails) {
  178.                     $xxxx = $this->parseHTML($html,'email');
  179.                     $elist = array_merge($elist,array_unique($xxxx));
  180.                 }
  181.                 $tmp = array_merge($tmp, array_unique($xyz));
  182.             }
  183.             if($sameDomain) {
  184.                 $jar = array();
  185.                 foreach($tmp as $x) {
  186.                     $tld_regex = '/\.(com|org|net|me|in|io|cm|uk|biz|ly|tk|edu|gov|mil|info|xxx|pw|ws|ru|ro|asia|us|se)(\.|\/|)/i';
  187.                     if(preg_match("/$domain/i",$x) || !preg_match('/(\:\/\/|www\.)/i',$x) && !preg_match($tld_regex,$x) ) array_push($jar,$x);
  188.                 }
  189.                 $tmp = $jar;
  190.             }
  191.             $urls = $tmp;
  192.             $i++;
  193.         }
  194.         $tmp = array();
  195.         foreach($data as $x) {
  196.             $tld_regex = '/\.(com|org|net|me|in|io|cm|uk|biz|ly|tk|edu|gov|mil|info|xxx|pw|ws|ru|ro|asia|us|se)(\.|\/|)/i';
  197.             if(!preg_match("/$domain/i",$x) && !preg_match('/(\:\/\/|www\.)/i',$x) && !preg_match($tld_regex,$x)) $x = "http://$domain/$x";
  198.             if(!preg_match('/(htt|ft)p(s|)\:\/\//i',$x)) $x = "http://$x";
  199.             $x = str_replace('////','//',$x);
  200.             $x = str_replace('///','//',$x);
  201.             array_push($tmp,$x);
  202.         }
  203.         $hacky['urls'] = array_unique($tmp);
  204.         if($this->emails) {
  205.             $hacky['emails'] = $elist;
  206.         }
  207.         return $hacky;
  208.     }
  209. }
  210. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement