Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- # Competitor Intelligence Agent
- # License: Creative Commons CC BY-NC-SA
- #
- # By Michael Gradek, a proud Udacian
- # Twitter: @MichaelGradek
- #
- # Competitor Intelligence Agent aims to collect and display a report with relevant information about Groupon.es, one of the largest Collective Buying sites in Spain (... and in the world!)
- # For more information about Collective Buying please refer to: http://en.wikipedia.org/wiki/Group_buying
- #
- # To keep things simple and easy to understand, my script will only consider Groupon in Spain (groupon.es)
- #
- # Why is this an interesting thing to do?
- # Imagine you are the Business Intelligence director of one of Spain's Collective Buying companies such as Groupalia.com, Letsbonus.com, etc.
- # Wouldn't you love to receive a daily report stating what your competition is up to? i.e.: What deals do they have? What's their average discount? What were their sales yesterday? And even how much are they going to sell this year?!?
- #
- # The goal for this crawler is to crawl the Groupon.es site, collect all the currently posted deals, and then crawl the deals and fetch all information that would be relevant for a competitor's Business Intelligence director
- # Once all the deals have been found and saved, the script can print a report showing the performance of each business area, and even forecast yearly sales!
- #
- # Definitions: Deals: Any sort of coupon sold on the site
- # Local: Coupons from local merchants such as a restaurant, beauty & wellness, services, etc.
- # Travel: Coupons for travelling such as trips, hotels, etc.
- # Shopping: Coupons for products such as phones, devices, watches, etc.
- import urllib2
# ----- Begin initialize -----
# Seed page: only used to discover the cities Groupon operates in
seed_all = 'http://www.groupon.es/all-deals/oferta-nacional'
# Test seed: limits the crawl to Barcelona deals (not the whole country) to reduce the number of pages requested
seed_barcelona = 'barcelona'

# One [name, relative url] entry per city deal mini-site Groupon.es operates
cities = []

# Deal URLs still to be crawled, keyed by category
deals_to_crawl = {'Local': [], 'Travel': [], 'Shopping': []}
# Crawled deals, keyed by category
deals = {'Local': [], 'Travel': [], 'Shopping': []}

# Progress counters. They are printed while crawling to show that nothing timed out and the crawler is still advancing.
# They start as floats, so the progress percentage is computed with floating-point division.
number_cities_crawled = 0.0
total_cities_to_crawl = 0.0
number_deals_crawled = 0.0
total_deals_to_crawl = 0.0
# ----- End initialize -----
- # ----- Begin procedures -----
def fetch_cities(seed): # Crawl seed page to find all cities Groupon.es operates in
    """Fill the global ``cities`` list with [name, relative url] entries.

    seed: either ``seed_barcelona`` (no page is requested; a single
          Barcelona entry is added) or the URL of a page containing the
          city selector, which is downloaded and parsed.

    ``total_cities_to_crawl`` is increased by one for every entry added.
    Nothing is returned.
    """
    global total_cities_to_crawl

    def extract_cities(html):
        global total_cities_to_crawl
        link_marker = "window.location.href = 'http://www.groupon.es/deals/"

        start_pos = html.find('<div id="citySelectBox" ')
        if start_pos == -1:
            return # No city selector on this page, so there is nothing to parse
        code = html[start_pos:]

        for i in range(2): # Skip first two links as they are not cities
            start_pos = code.find('<li class')
            if start_pos == -1:
                return
            code = code[start_pos+len('<li class'):]
            start_pos = code.find('</li>')
            if start_pos == -1:
                return
            code = code[start_pos+len('</li>'):]

        while code.find('</ul>') > 20: # Detect end of list of cities
            # Relative URL of the city mini-site
            start_pos = code.find(link_marker)
            if start_pos == -1:
                break # find() failed: slicing with -1 would yield a garbage entry
            code = code[start_pos+len(link_marker):]
            end_pos = code.find("';")
            if end_pos == -1:
                break
            url = code[:end_pos]
            code = code[end_pos:]

            # City name
            start_pos = code.find('<span>')
            if start_pos == -1:
                break
            code = code[start_pos+len('<span>'):]
            end_pos = code.find('</span>')
            if end_pos == -1:
                break
            name = code[:end_pos]
            code = code[end_pos+len('</span>'):]

            # Count the city only once it has been parsed completely
            cities.append([name, url])
            total_cities_to_crawl += 1

    if seed == seed_barcelona: # use seed_barcelona to reduce page requests and thus take less time to execute!
        cities.append(['Barcelona', seed_barcelona])
        total_cities_to_crawl += 1
    else:
        seed_response = urllib2.urlopen(seed)
        try:
            seed_html = seed_response.read()
        finally:
            seed_response.close() # Always release the connection, even if the read fails
        extract_cities(seed_html)
def crawl_city(url): # Crawl cities to find all URLs to all deals
    """Download one city page and queue the deal URLs found in it.

    url: absolute URL of the city's all-deals page.

    Local deal links are appended to ``deals_to_crawl['Local']`` on every
    call. Shopping and Travel links are collected only while their lists
    are still empty, as those categories do not depend on the city.
    ``number_cities_crawled`` is incremented and a progress percentage is
    printed. Nothing is returned.
    """
    global number_cities_crawled
    link_marker = 'dealPermaLink":"'
    # Category sections, in the order the page is expected to list them
    section_markers = [('Local', 'var itemsLocalDeals = ['),
                       ('Shopping', 'var itemsShoppingDeals = ['),
                       ('Travel', 'var itemsTravelDeals = [')]

    def section_links(html, index):
        # Return the deal links found between this category's marker and
        # the nearest later category marker (or the end of the page).
        marker = section_markers[index][1]
        start_pos = html.find(marker)
        if start_pos == -1:
            return [] # Category is not present on this page
        section = html[start_pos+len(marker):]
        section_ends = []
        for later in section_markers[index+1:]:
            end_pos = section.find(later[1])
            if end_pos != -1:
                section_ends.append(end_pos)
        if section_ends:
            section = section[:min(section_ends)]

        links = []
        while True:
            start_pos = section.find(link_marker)
            if start_pos == -1:
                break # No links left in this category
            section = section[start_pos+len(link_marker):]
            end_pos = section.find('"')
            if end_pos == -1:
                break # Unterminated link: ignore it instead of storing garbage
            links.append(section[:end_pos])
            section = section[end_pos:]
        return links

    city_response = urllib2.urlopen(url)
    try:
        city_html = city_response.read()
    finally:
        city_response.close() # Always release the connection, even if the read fails
    number_cities_crawled += 1

    deals_to_crawl['Local'].extend(section_links(city_html, 0)) # Always crawl to fetch all deals from all cities
    if not deals_to_crawl['Shopping']: # Crawl only once, as deals from this category are independent from cities
        deals_to_crawl['Shopping'].extend(section_links(city_html, 1))
    if not deals_to_crawl['Travel']: # Crawl only once, as deals from this category are independent from cities
        deals_to_crawl['Travel'].extend(section_links(city_html, 2))

    if total_cities_to_crawl: # Skip the progress line rather than divide by zero
        print(str(round((number_cities_crawled / total_cities_to_crawl) * 100, 2)) + '% complete ...')
def fetch_deals():
    """Crawl the all-deals page of every entry in the global ``cities`` list.

    The second item of each entry is appended to the fixed all-deals
    prefix and the resulting URL is handed to ``crawl_city``.
    """
    base = 'http://www.groupon.es/all-deals/'
    for entry in cities:
        crawl_city(base + entry[1])
def crawl_deals(type):
    """Download and parse every queued deal page of one category.

    type: key of ``deals_to_crawl`` / ``deals`` ('Local', 'Travel' or
          'Shopping').

    For each relative URL in ``deals_to_crawl[type]`` a record
    ``[price, discount, saving, units sold, relative url]`` is appended to
    ``deals[type]``. ``number_deals_crawled`` is incremented per page and a
    progress percentage is printed. Nothing is returned.

    Raises ValueError (from ``float``) when a scraped number cannot be
    parsed.
    """
    global number_deals_crawled

    def clean_up_string(string):
        # Keep digits and letters, turn the decimal comma into a dot and drop everything else (white space, thousands dots, markup).
        # NOTE(review): letters are kept as the original comment documents, but float() rejects them - confirm whether they should be dropped too.
        kept = []
        for char in string:
            if char == ',':
                kept.append('.')
            elif '0' <= char <= '9' or 'A' <= char <= 'Z' or 'a' <= char <= 'z': # Digits start at '0'; '/' (ord 47) is no longer let through
                kept.append(char)
        return ''.join(kept)

    def take_between(html, opening, closing):
        # Return (text between opening and closing, remainder starting at closing)
        html = html[html.find(opening)+len(opening):]
        end_pos = html.find(closing)
        return html[:end_pos], html[end_pos:]

    for relative_url in deals_to_crawl[type]:
        deal_response = urllib2.urlopen('http://www.groupon.es' + relative_url)
        try:
            deal_html = deal_response.read()
        finally:
            deal_response.close() # Always release the connection, even if the read fails
        number_deals_crawled += 1
        deal = []

        # Find price
        deal_html = deal_html[deal_html.find('Precio:'):]
        deal_price, deal_html = take_between(deal_html, '<span class="noWrap">', ' \xe2') # \xe2 = euro sign (€)
        deal.append(float(clean_up_string(deal_price)))

        # Find discount and saving
        start_pos = deal_html.find('Descuento</td>')
        if start_pos != -1:
            deal_html = deal_html[start_pos:]
            deal_discount, deal_html = take_between(deal_html, '<td class="col1">', '%')
            deal.append(float(clean_up_string(deal_discount)))
            deal_saving, deal_html = take_between(deal_html, '<td>', ' \xe2') # \xe2 = euro sign (€)
            deal.append(float(clean_up_string(deal_saving)))
        else: # Some Shopping deals don't have a discount displayed ... Assumption: those products simply don't have a discount, so 0 is stored
            deal.append(0.)
            deal.append(0.)

        # Find number of people who bought coupon
        if deal_html.find('<span id="jDealSoldAmount">') != -1:
            deal_bought, deal_html = take_between(deal_html, '<span id="jDealSoldAmount">', '</span>')
            deal.append(float(clean_up_string(deal_bought)))
        else: # If this string can't be found, that means that 0 people have bought the deal so far
            deal.append(0)

        deal.append(relative_url) # Append deal relative URL as last element of list
        deals[type].append(deal)

        if total_deals_to_crawl: # Skip the progress line rather than divide by zero
            print(str(round((number_deals_crawled / total_deals_to_crawl) * 100, 2)) + '% complete ...')
def reporting(type, print_forecast = False):
    """Build the text report for one category, or for all of them.

    type:           a key of the global ``deals`` dictionary, or 'Global'
                    to aggregate the deals of every category.
    print_forecast: when True, a yearly sales forecast line is appended.

    Each deal is read as [price, discount, saving, units sold, url].
    Returns the report as a single string without a trailing newline.
    With no deals (or no units sold) the affected figures are reported as
    0 instead of raising ZeroDivisionError. Raises KeyError for an unknown
    category.
    """
    def forecast(daily_sales):
        # NOTE: This by no means is a very rigorous method of forecasting yearly sales, but is probably the best we can do with 'one-time' data
        return daily_sales * 365

    def format_number(number):
        # Insert a comma between every group of three digits; a leading minus sign is kept outside the grouping
        digits = str(number)
        sign = ''
        if digits.startswith('-'):
            sign = '-'
            digits = digits[1:]
        groups = []
        while len(digits) > 3:
            groups.insert(0, digits[-3:])
            digits = digits[:-3]
        groups.insert(0, digits)
        return sign + ','.join(groups)

    def rounded(value):
        return int(round(value))

    def average(total, count):
        if not count: # Nothing to average: report 0 instead of dividing by zero
            return 0.
        return total / count

    def min_avg_max_row(values):
        # Render the 'min <tab><tab> avg <tab><tab> max' cells of one table row
        if values:
            cells = [min(values), average(sum(values, 0.), len(values)), max(values)]
        else:
            cells = [0., 0., 0.]
        return '\t\t'.join([str(rounded(cell)) for cell in cells])

    if type != 'Global':
        selected = list(deals[type])
    else:
        selected = []
        for category in deals:
            selected.extend(deals[category])

    number_deals = len(selected)
    total_sales_units = sum([deal[3] for deal in selected], 0.)
    total_sales_euros = sum([deal[0] * deal[3] for deal in selected], 0.)
    total_saving_euros = sum([deal[2] * deal[3] for deal in selected], 0.)

    best_deal = [0., '']
    for deal in selected:
        if deal[0] * deal[3] > best_deal[0]: # What is the best performing deal in terms of revenue?
            best_deal = [deal[0] * deal[3], deal[4]]

    lines = [
        '############# '+type+' #############',
        '\t\t\tMin:\tAvg:\tMax:',
        'Price €:\t' + min_avg_max_row([deal[0] for deal in selected]),
        'Discount %:\t' + min_avg_max_row([deal[1] for deal in selected]),
        'Saving €:\t' + min_avg_max_row([deal[2] for deal in selected]),
        '---------------------------------',
        'Deals offered #: ' + str(number_deals),
        'Total deals sold #: ' + format_number(rounded(total_sales_units)),
        'Total savings €: ' + format_number(rounded(total_saving_euros)),
        'Average saving €: ' + format_number(rounded(average(total_saving_euros, total_sales_units))),
        'Total sales €: ' + format_number(rounded(total_sales_euros)),
        'Average ticket €: ' + format_number(rounded(average(total_sales_euros, total_sales_units))),
        'Best deal: ' + format_number(rounded(best_deal[0])) + ' € => ' + best_deal[1],
    ]
    if print_forecast:
        lines.append('Sales forecast for this year €: ' + format_number(rounded(forecast(total_sales_euros))))
    return '\n'.join(lines)
- # ----- End procedures -----
# ----- Begin control panel -----
# Set these switches to True or False to choose what gets crawled. Nothing below the panel needs changing, unless you want to fiddle around with the crawler ;)
switch_seed_all = False # True: crawls ALL deals on the site; False: crawls only deals in Barcelona. Leave as False for faster execution.
switch_crawl_shopping = True
switch_crawl_travel = True
switch_crawl_local = True
switch_print_global_report = True # Print a global report of all 3 types of deals (only turn True if previous 3 switches are True)
switch_print_forecast = True # Print a (very non-scientific) yearly sales forecast
# ----- End control panel -----
# Enabled categories, in the order they are counted, crawled and reported
enabled_categories = [category for category, enabled in (('Shopping', switch_crawl_shopping),
                                                         ('Travel', switch_crawl_travel),
                                                         ('Local', switch_crawl_local)) if enabled]
if not enabled_categories:
    print('Switches are set to crawl nothing ...')
else:
    print('Crawling seed site ...')
    # The two modes differ only in the seed handed to fetch_cities
    if switch_seed_all:
        fetch_cities(seed_all)
    else:
        fetch_cities(seed_barcelona)
    print('Found '+str(len(cities))+' cities to crawl ...')
    fetch_deals()
    # Total used by crawl_deals for its progress percentage
    total_deals_to_crawl = 0
    for category in enabled_categories:
        total_deals_to_crawl += len(deals_to_crawl[category])
    print('Found '+str(total_deals_to_crawl)+' deals to crawl ...')
    for category in enabled_categories:
        crawl_deals(category)
    for category in enabled_categories:
        print(reporting(category, switch_print_forecast))
    if switch_print_global_report:
        print(reporting('Global', switch_print_forecast))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement