Advertisement
Guest User

Group on Curl

a guest
Jan 14th, 2019
517
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 7.50 KB | None | 0 0
  1. <?php
  2.  
  // Error suppression and extended maximum execution time.
  // NOTE(review): error_reporting(0) hides every diagnostic, including the
  // ones that would explain a failed scrape - consider logging instead.
  error_reporting(0);
  ini_set('max_execution_time', 50000);

  // Sitemap URL list
  $all_activity_urls = array();
  $sitemap_url = array(
       'https://www.groupon.de/sitemaps/deals-local0.xml.gz'
  );
  // Cookies collected from response headers by curlResponseHeaderCallback().
  $cookies = array();

  // Loop through every sitemap and collect the activity URLs it lists.
  foreach ($sitemap_url as $sitemap)
  {
       $ch1 = curl_init();
       curl_setopt($ch1, CURLOPT_URL, $sitemap);
       curl_setopt($ch1, CURLOPT_RETURNTRANSFER, true);
       curl_setopt($ch1, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:38.0) Gecko/20100101 Firefox/38.0');
       curl_setopt($ch1, CURLOPT_REFERER, "https://www.groupon.de/");
       curl_setopt($ch1, CURLOPT_TIMEOUT, 40);
       // Keep TLS peer verification on; disabling it lets anyone on the
       // network path substitute the sitemap contents.
       curl_setopt($ch1, CURLOPT_SSL_VERIFYPEER, true);
       // Parsing Cookie from the response header
       curl_setopt($ch1, CURLOPT_HEADERFUNCTION, "curlResponseHeaderCallback");
       $activity_url_source = curl_exec($ch1);
       $status_code = curl_getinfo($ch1, CURLINFO_HTTP_CODE);
       curl_close($ch1);

       // Skip this sitemap on transport failure, non-200 status or empty body.
       if ($status_code !== 200 || !is_string($activity_url_source) || $activity_url_source === '')
       {
            continue;
       }

       // The sitemap is served as a .gz file, so the body may be gzip data
       // (magic bytes 0x1f 0x8b) and must be inflated before XML parsing.
       if (strncmp($activity_url_source, "\x1f\x8b", 2) === 0)
       {
            $activity_url_source = function_exists('gzdecode') ? gzdecode($activity_url_source) : false;
            if (!is_string($activity_url_source) || $activity_url_source === '')
            {
                 continue;
            }
       }

       // Parse the XML sitemap; collect parser errors internally and restore
       // the previous libxml error mode afterwards.
       $previous_libxml_mode = libxml_use_internal_errors(true);
       $sitemap_xml = simplexml_load_string($activity_url_source);
       libxml_clear_errors();
       libxml_use_internal_errors($previous_libxml_mode);

       if ($sitemap_xml === false)
       {
            continue;
       }

       // Iterating the SimpleXML nodes directly works for zero, one or many
       // <url> entries; a JSON round-trip collapses a single entry into an
       // object instead of a list.
       foreach ($sitemap_xml->url as $url_entry)
       {
            $loc = trim((string) $url_entry->loc);
            if ($loc !== '')
            {
                 $all_activity_urls[] = $loc;
            }
       }
  }
  42.  
  // NOTE(review): this hard-coded list replaces whatever was collected into
  // $all_activity_urls earlier in the script, so only this single deal page is
  // scraped and the "no URL" branch below cannot currently run. It looks like
  // a debugging override - confirm before removing.
  $all_activity_urls = array(
       'https://www.groupon.com/deals/gs-c-c-cable-knit-fuzzy-lined-head-wrap-headband-ear-warmer'
  );

  if (count($all_activity_urls) > 0)
  {
       // URL loop bounds: at most 100 URLs per run, and never past the end of
       // the list (a fixed upper bound of 100 requested non-existent entries).
       $loop_from = 0;
       $loop_to = min(100, count($all_activity_urls));

       $final_data = array();
       echo 'script start - ' . date('h:i:s') . "<br>";

       // Currency symbols/markers stripped from scraped prices. DOM text holds
       // the decoded non-breaking space (U+00A0, bytes C2 A0), not the literal
       // "&nbsp;" entity, so both forms are listed.
       $price_noise = array('$', "€", 'US', '&nbsp;', "\xC2\xA0");

       // Returns the text of the first node of an XPath result, or '' when the
       // query failed or matched nothing.
       $first_text = function ($nodes)
       {
            if ($nodes === false || $nodes->length === 0)
            {
                 return '';
            }
            return $nodes->item(0)->nodeValue;
       };

       // Real-world HTML is rarely well-formed; collect libxml's complaints
       // internally instead of emitting them.
       $previous_libxml_mode = libxml_use_internal_errors(true);

       for ($u = $loop_from; $u < $loop_to; $u++)
       {
            $link = $all_activity_urls[$u];

            // Request headers imitating a desktop browser
            $headers = array(
                 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                 'accept-language: en-US,en;q=0.9,bn-BD;q=0.8,bn;q=0.7,it;q=0.6',
                 'cache-control: max-age=0',
                 'upgrade-insecure-requests: 1',
                 'user-agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
            );
            // Only send a cookie header when there is something to send.
            if (count($cookies) > 0)
            {
                 $headers[] = 'cookie: ' . implode('; ', $cookies);
            }

            // Pull source from webpage
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $link);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
            curl_setopt($ch, CURLOPT_REFERER, "https://www.groupon.de/");
            curl_setopt($ch, CURLOPT_TIMEOUT, 40);
            // Keep TLS peer verification on; disabling it lets anyone on the
            // network path substitute the page contents.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
            // Parsing Cookie from the response header
            curl_setopt($ch, CURLOPT_HEADERFUNCTION, "curlResponseHeaderCallback");
            $data = curl_exec($ch);
            $status_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_close($ch);

            if ($status_code === 200)
            {
                 // Ready data for parsing; the prepended meta tag makes
                 // DOMDocument treat the markup as UTF-8.
                 $document = new DOMDocument();
                 $document->loadHTML('<meta http-equiv="content-type" content="text/html; charset=utf-8">' . $data);
                 libxml_clear_errors();
                 $xpath = new DOMXPath($document);

                 // Never scraped by this script; exported as an empty field.
                 $image = '';

                 // Scraping Availability (XPath positions are 1-based, so the
                 // first child div is div[1]; div[0] never matches).
                 $availability = $first_text($xpath->query('//div[@data-bhw="DealHighlights"]/div[1]/div/div'));

                 // Scraping Title
                 $title = $first_text($xpath->query('//h1[@id="deal-title"]'));

                 // Scraping Price
                 $raw_price = $first_text($xpath->query('//div[@class="price-discount-wrapper"]'));
                 $price = trim(str_replace($price_noise, '', $raw_price));

                 // Scraping Old Price
                 $raw_base_price = $first_text($xpath->query('//div[contains(@class, "value-source-wrapper")]'));
                 $base_price = trim(str_replace($price_noise, '', $raw_base_price));

                 // Creating Final Data Array
                 $final_data[] = array(
                      'link' => $link,
                      'availability' => $availability,
                      'name' => $title,
                      'price' => $price,
                      'baseprice' => $base_price,
                      'img' => $image,
                 );
            }
            else
            {
                 // Record the failure so the XML shows which URLs were skipped.
                 $status_msg = ($status_code === 429) ? ' - Too Many Requests' : '';

                 $final_data[] = array(
                      'link' => $link,
                      'status' => $status_code . $status_msg,
                 );
            }

            // Pause between requests to stay under the site's rate limit.
            echo 'before break - ' . date('h:i:s') . "<br>";
            sleep(5);
            echo 'after break - ' . date('h:i:s') . "<br>";
            flush();
       }

       libxml_use_internal_errors($previous_libxml_mode);

       echo 'script end - ' . date('h:i:s') . "<br>";

       // Converting data to XML
       $activities = new SimpleXMLElement("<?xml version=\"1.0\"?><activities></activities>");
       array_to_xml($final_data, $activities);
  }
  else
  {
       // Nothing to scrape: emit an XML document that only carries the error.
       $activities = new SimpleXMLElement("<?xml version=\"1.0\"?><activities></activities>");
       $activities->addChild("error", htmlspecialchars("No URL scraped from sitemap. Stoping script."));
  }

  // Write the result file and report the outcome (same for both branches).
  $xml_file = $activities->asXML('activities.xml');
  if ($xml_file)
  {
       echo 'XML file have been generated successfully.';
  }
  else
  {
       echo 'XML file generation error.';
  }
  174.  
  /**
   * Recursively appends the contents of an array to a SimpleXMLElement.
   *
   * Every array entry becomes a child element named after its key. Numeric
   * keys cannot be used as XML element names, so such entries are emitted as
   * <activity> elements instead - for nested arrays as well as scalar values.
   * Nested arrays are converted recursively; anything else is cast to string
   * and stored as the element's text.
   *
   * @param array            $array      Data to convert.
   * @param SimpleXMLElement $activities Node that receives the new children.
   * @return void
   */
  function array_to_xml($array, &$activities)
  {
       foreach ($array as $key => $value)
       {
            // "<0>", "<1>", ... would be malformed XML, so fall back to a
            // fixed element name for list-style entries.
            $name = is_numeric($key) ? 'activity' : "$key";

            if (is_array($value))
            {
                 $subnode = $activities->addChild($name);
                 array_to_xml($value, $subnode);
            }
            else
            {
                 // addChild() does not escape ampersands itself, hence the
                 // explicit escaping. ENT_SUBSTITUTE keeps the rest of the
                 // text when it contains invalid UTF-8 instead of returning
                 // an empty string.
                 $activities->addChild($name, htmlspecialchars("$value", ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'));
            }
       }
  }
  199.  
  /**
   * CURLOPT_HEADERFUNCTION callback that collects cookies from responses.
   *
   * For every "Set-Cookie" header line the leading "name=value" pair (the
   * part before the first ";") is stored in the global $cookies array, keyed
   * by cookie name. A cookie that is set again replaces its earlier value
   * rather than being appended a second time, and surrounding whitespace
   * (including the line's trailing CRLF) is stripped so the stored pair can be
   * joined safely into a request "cookie:" header.
   *
   * @param mixed  $ch         cURL handle supplied by cURL (unused).
   * @param string $headerLine One raw response header line.
   * @return int Length of $headerLine in bytes, as cURL requires.
   */
  function curlResponseHeaderCallback($ch, $headerLine)
  {
       global $cookies;
       if (preg_match('/^Set-Cookie:\s*([^;]*)/mi', $headerLine, $cookie) === 1)
       {
            // Without attributes the capture runs to the end of the line and
            // would otherwise keep the trailing "\r\n".
            $pair = trim($cookie[1]);
            if ($pair !== '')
            {
                 $parts = explode('=', $pair, 2);
                 $cookies[trim($parts[0])] = $pair;
            }
       }
       return strlen($headerLine); // Needed by curl
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement