Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
<?php
// Suppress error output and raise the execution time limit: the crawl
// further down sleeps between requests and can run for a long time.
error_reporting(0);
ini_set('max_execution_time', 50000);

// Sitemap URL list
$all_activity_urls = array();
$sitemap_url = array(
    'https://www.groupon.de/sitemaps/deals-local0.xml.gz'
);
// Filled by curlResponseHeaderCallback() from Set-Cookie response headers.
$cookies = array();

// Download every sitemap and collect the <loc> of each <url> entry.
foreach ($sitemap_url as $sitemap) {
    $ch1 = curl_init();
    curl_setopt($ch1, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch1, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:38.0) Gecko/20100101 Firefox/38.0');
    curl_setopt($ch1, CURLOPT_REFERER, "https://www.groupon.de/");
    curl_setopt($ch1, CURLOPT_TIMEOUT, 40);
    curl_setopt($ch1, CURLOPT_URL, $sitemap);
    // NOTE(review): peer verification is disabled, so the TLS certificate of
    // the remote host is not checked. Kept as in the original; confirm this
    // is intentional.
    curl_setopt($ch1, CURLOPT_SSL_VERIFYPEER, false);
    // Parsing Cookie from the response header
    curl_setopt($ch1, CURLOPT_HEADERFUNCTION, "curlResponseHeaderCallback");
    $activity_url_source = curl_exec($ch1);
    $status_code = curl_getinfo($ch1, CURLINFO_HTTP_CODE);
    curl_close($ch1);

    // curl_exec() returns false on transport errors; skip anything that is
    // not a successful, non-empty response.
    if ($status_code !== 200 || !is_string($activity_url_source) || $activity_url_source === '') {
        continue;
    }

    // The URL points at a .gz file and this handle does not ask cURL to
    // decode anything, so the body may be raw gzip data. Detect the gzip
    // magic bytes and inflate before handing the payload to the XML parser.
    if (strncmp($activity_url_source, "\x1f\x8b", 2) === 0 && function_exists('gzdecode')) {
        $inflated = gzdecode($activity_url_source);
        if ($inflated === false) {
            continue;
        }
        $activity_url_source = $inflated;
    }

    // Parsing XML sitemap for activity urls
    $activity_url_list = simplexml_load_string($activity_url_source);
    if ($activity_url_list === false) {
        continue;
    }

    // Iterate the SimpleXML nodes directly: a JSON round-trip turns a single
    // <url> child into an object instead of a list, which breaks indexing.
    foreach ($activity_url_list->url as $url_entry) {
        $loc = trim((string) $url_entry->loc);
        if ($loc !== '') {
            $all_activity_urls[] = $loc;
        }
    }
}
// NOTE(review): this assignment discards whatever URL list was built earlier
// in the script and replaces it with a single hard-coded deal page. It looks
// like a debugging leftover; it is kept here because removing it changes what
// the script crawls. Confirm and delete if the sitemap URLs should be used.
$all_activity_urls = array(
    'https://www.groupon.com/deals/gs-c-c-cable-knit-fuzzy-lined-head-wrap-headband-ear-warmer'
);

// Result document; it receives either the scraped activities or an <error>.
$activities = new SimpleXMLElement("<?xml version=\"1.0\"?><activities></activities>");

if (count($all_activity_urls) > 0) {
    // URL loop bounds: crawl at most 100 URLs, and never more than exist.
    // A fixed upper bound of 100 would index past the end of a shorter list
    // and spend a 5 second sleep on every missing entry.
    $loop_from = 0;
    $loop_to = min(100, count($all_activity_urls));
    $final_data = array();

    // Returns the text of the first node matched by $query, or '' when the
    // query fails or matches nothing (avoids dereferencing a missing node).
    $first_text = function ($xpath, $query) {
        $nodes = $xpath->query($query);
        if ($nodes === false || $nodes->length === 0) {
            return '';
        }
        return (string) $nodes->item(0)->nodeValue;
    };

    // Strips currency markers and spaces from a scraped price string.
    $clean_price = function ($raw) {
        return trim(str_replace(array('$', '€', 'US', ' '), '', $raw));
    };

    echo 'script start - ' . date('h:i:s') . "<br>";
    for ($u = $loop_from; $u < $loop_to; $u++) {
        // Request headers are rebuilt on every iteration because $cookies is
        // updated by the header callback while responses are received.
        $headers = array(
            'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-language: en-US,en;q=0.9,bn-BD;q=0.8,bn;q=0.7,it;q=0.6',
            'cache-control: max-age=0',
            'cookie: ' . implode('; ', $cookies),
            'upgrade-insecure-requests: 1',
            'user-agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        );
        $link = $all_activity_urls[$u];

        // Pull source from webpage
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        curl_setopt($ch, CURLOPT_REFERER, "https://www.groupon.de/");
        curl_setopt($ch, CURLOPT_TIMEOUT, 40);
        curl_setopt($ch, CURLOPT_URL, $link);
        // NOTE(review): TLS peer verification is disabled; confirm intent.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        // Parsing Cookie from the response header
        curl_setopt($ch, CURLOPT_HEADERFUNCTION, "curlResponseHeaderCallback");
        $data = curl_exec($ch);
        $status_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($status_code === 200) {
            // Ready data for parsing; the prepended meta tag makes DOMDocument
            // read the markup as UTF-8.
            $document = new DOMDocument();
            $document->loadHTML('<meta http-equiv="content-type" content="text/html; charset=utf-8">' . $data);
            $xpath = new DOMXPath($document);

            // Scraping Availability. XPath positions are 1-based, so the
            // first child div is div[1]; a [0] predicate never matches.
            $availability = $first_text($xpath, '//div[@data-bhw="DealHighlights"]/div[1]/div/div');
            $title = $first_text($xpath, '//h1[@id="deal-title"]');
            // Scraping Price
            $price = $clean_price($first_text($xpath, '//div[@class="price-discount-wrapper"]'));
            // Scraping Old Price
            $base_price = $clean_price($first_text($xpath, '//div[contains(@class, "value-source-wrapper")]'));
            // No image is scraped by this script; the field is written empty.
            $image = '';

            // Creating Final Data Array
            $final_data[] = array(
                'link' => $link,
                'availability' => $availability,
                'name' => $title,
                'price' => $price,
                'baseprice' => $base_price,
                'img' => $image,
            );
        } else {
            // Record the failed request together with its HTTP status.
            $status_msg = ($status_code === 429) ? ' - Too Many Requests' : '';
            $final_data[] = array(
                'link' => $link,
                'status' => $status_code . $status_msg,
            );
        }

        // Pause between requests.
        echo 'before break - ' . date('h:i:s') . "<br>";
        sleep(5);
        echo 'after break - ' . date('h:i:s') . "<br>";
        flush();
    }
    echo 'script end - ' . date('h:i:s') . "<br>";

    // Converting data to XML
    array_to_xml($final_data, $activities);
} else {
    $activities->addChild("error", htmlspecialchars("No URL scraped from sitemap. Stoping script."));
}

// Write the document to disk and report the outcome (shared by both paths).
$xml_file = $activities->asXML('activities.xml');
if ($xml_file) {
    echo 'XML file have been generated successfully.';
} else {
    echo 'XML file generation error.';
}
// Recursive Function for creating XML Nodes
/**
 * Appends the contents of $array to the XML element $activities.
 *
 * Scalar values become child elements named after their key, with the value
 * passed through htmlspecialchars(). Array values become a nested element
 * that is filled recursively: it is named after the key, or "activity" when
 * the key is numeric.
 *
 * @param array            $array      Data to convert.
 * @param SimpleXMLElement $activities Element that receives the new children.
 */
function array_to_xml($array, &$activities)
{
    foreach ($array as $key => $value) {
        // Leaf value: emit it directly and move on.
        if (!is_array($value)) {
            $activities->addChild("$key", htmlspecialchars("$value"));
            continue;
        }

        // Nested array: numeric keys are list entries and share one tag name.
        $tag = is_numeric($key) ? "activity" : "$key";
        $child = $activities->addChild($tag);
        array_to_xml($value, $child);
    }
}
// Cookie Parsing Function
/**
 * CURLOPT_HEADERFUNCTION callback that records cookies set by the server.
 *
 * For every Set-Cookie header line the leading "name=value" pair (everything
 * before the first ';') is stored in the global $cookies array. Entries are
 * keyed by cookie name, so a cookie that is sent again replaces its earlier
 * value instead of piling up as a duplicate; imploding the array values still
 * yields a usable Cookie header.
 *
 * @param mixed  $ch         cURL handle passed by cURL (unused).
 * @param string $headerLine One raw response header line, including its
 *                           line terminator.
 * @return int Number of bytes handled; cURL requires the full line length.
 */
function curlResponseHeaderCallback($ch, $headerLine)
{
    global $cookies;
    // Exclude CR/LF from the capture: a Set-Cookie line without attributes
    // has no ';', and the line terminator must not end up in the cookie.
    if (preg_match('/^Set-Cookie:\s*([^;\r\n]*)/mi', $headerLine, $cookie) === 1) {
        $pair = trim($cookie[1]);
        if ($pair !== '') {
            $parts = explode('=', $pair, 2);
            $cookies[$parts[0]] = $pair;
        }
    }
    return strlen($headerLine); // Needed by curl
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement