Advertisement
kwasinski

MyScrapper.php

Feb 28th, 2015
324
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 2.20 KB | None | 0 0
  1. <?php
  /**
   * Downloads a remote page with cURL and caches it in a local file.
   *
   * Typical flow: construct with a URL (or call set_url() later), call
   * curl_scrap() to fetch the page, then hand the result to
   * check_n_save_page() to refresh the local copy.
   */
  class MyScrapper {
      /** @var string Path of the local file the scraped page is written to. */
      public $file_scrapped_filename = 'megasena.html';

      /** @var string URL to scrape; an empty string means "not configured yet". */
      public $url = '';

      /**
       * @param string|false $url URL to scrape. When falsy, the URL stays empty
       *                          and has to be supplied through set_url().
       */
      public function __construct($url = false) {
          // A constructor's return value is ignored, so simply skip the setter.
          if ($url) {
              $this->set_url($url);
          }
      }

      /**
       * Fetches $this->url with cURL.
       *
       * Terminates the script with die() when cURL reports a transfer error.
       *
       * @return array|false array('scraped_page' => string, 'filetime' => int)
       *                     on success, where filetime is the remote
       *                     modification time as a Unix timestamp or -1 when
       *                     unknown; false when no URL is configured, the cURL
       *                     handle cannot be created, or the response is empty.
       */
      public function curl_scrap() {
          // $url defaults to '', so isset() would always pass; test for an
          // empty value instead.
          if (!$this->url) {
              return false;
          }

          $curl_session = curl_init($this->url);
          if ($curl_session === false) {
              return false;
          }

          // Return the response body from curl_exec() instead of printing it.
          curl_setopt($curl_session, CURLOPT_RETURNTRANSFER, true);

          // Ask cURL to retrieve the remote modification time. The option is
          // CURLOPT_FILETIME; CURLINFO_FILETIME is only the key used to read
          // the value back through curl_getinfo().
          curl_setopt($curl_session, CURLOPT_FILETIME, true);

          $scraped_page = curl_exec($curl_session);
          if (curl_errno($curl_session)) {
              // Kill the script and report the error if curl_exec() failed.
              die('An error occoured while scraping: '.$this->url.' ERROR::'. curl_error($curl_session)."\n");
          }

          // Unix timestamp, or -1 when the server did not report one.
          $filetime = curl_getinfo($curl_session, CURLINFO_FILETIME);
          curl_close($curl_session);

          if (!$scraped_page) {
              return false;
          }

          return array(
              'scraped_page' => $scraped_page,
              'filetime' => $filetime,
          );
      }

      /**
       * Writes the scraped page to the local file when that file is missing
       * or older than the remote modification time.
       *
       * @param string $scraped_page Page content to store.
       * @param int    $filetime     Remote modification time (Unix timestamp).
       *
       * @return bool true if the local file exists after the call, false if it
       *              does not or if $scraped_page is empty.
       */
      public function check_n_save_page($scraped_page, $filetime) {
          if (!$scraped_page) {
              return false;
          }

          $target = $this->file_scrapped_filename;

          // filemtime() results are cached per request; clear the cache BEFORE
          // comparing so the decision is based on the file's current state.
          clearstatcache();

          if (!file_exists($target) || $filetime > filemtime($target)) {
              // file_put_contents() opens, writes and closes in one step, so
              // no handle is leaked; only report success when the write worked.
              if (file_put_contents($target, $scraped_page) !== false) {
                  print 'File created'."\n";
              }
          }

          return file_exists($target);
      }

      /**
       * Sets the URL to scrape.
       *
       * @param string $url
       *
       * @return bool Always true.
       */
      public function set_url($url) {
          $this->url = $url;
          return true;
      }
  }
  71.  
  72. // The closing PHP tag is omitted on purpose: any whitespace after a closing tag is sent as output, which can cause problems with output buffering (and headers), e.g. while handling large files.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement