Competitive Intelligence Agent - Michael Gradek

# -*- coding: utf-8 -*-

# Competitor Intelligence Agent
# License: Creative Commons CC BY-NC-SA
#
# By Michael Gradek, a proud Udacian
# Twitter: @MichaelGradek
#
# Competitor Intelligence Agent aims to collect and display a report with relevant information about Groupon.es, one of the largest Collective Buying sites in Spain (... and in the world!)
# For more information about Collective Buying please refer to: http://en.wikipedia.org/wiki/Group_buying
#
# To keep things simple and easy to understand, this script only considers Groupon in Spain (groupon.es)
#
# Why is this an interesting thing to do?
# Imagine you are the Business Intelligence director of one of Spain's Collective Buying companies, such as Groupalia.com or Letsbonus.com.
# Wouldn't you love to receive a daily report stating what your competition is up to? i.e.: What deals do they have? What's their average discount? What were their sales yesterday? And even how much are they going to sell this year?
#
# The goal of this crawler is to crawl the Groupon.es site, collect all the currently posted deals, and then crawl each deal page to fetch all the information that would be relevant to a competitor's Business Intelligence director.
# Once all the deals have been found and saved, the script can print a report showing the performance of each business area, and even forecast yearly sales!
#
# Definitions:  Deals: Any sort of coupon sold on the site
#               Local: Coupons from local merchants such as restaurants, beauty & wellness, services, etc.
#               Travel: Coupons for travelling such as trips, hotels, etc.
#               Shopping: Coupons for products such as phones, devices, watches, etc.

import urllib2

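# Note: urllib2.urlopen() raises urllib2.URLError on network failures and has no
# retry logic, so a single flaky request can kill a long crawl. The crawler below
# calls it directly; a more defensive fetch could look like this minimal sketch
# (a hypothetical helper, not wired into the crawler, assuming Python 2.6+ for
# the timeout parameter):
import time

def fetch_html(url, retries=3, delay=2):
    for attempt in range(retries):
        try:
            return urllib2.urlopen(url, timeout=30).read() # Give up on a single request after 30 seconds
        except urllib2.URLError:
            time.sleep(delay) # Pause briefly before retrying
    return '' # Caller must handle an empty page
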
  28. # ----- Begin initialize -----
  29.  
  30. seed_all = 'http://www.groupon.es/all-deals/oferta-nacional' # This page is only used to get the cities Groupon operates in
  31. seed_barcelona = 'barcelona' # This page will be used only for testing to reduce amount of pages to request (only Barcelona deals, not whole country)
  32.  
  33. cities = [] # List containing the cities in which Groupon.es operates and the relative url to the city deal mini-site
  34. deals_to_crawl =    {'Local': [],
  35.                      'Travel': [],
  36.                      'Shopping': []} # Dictionary with the urls to crawl for each category
  37. deals =             {'Local': [],
  38.                      'Travel': [],
  39.                      'Shopping': []} # Dictionary with deal for each category
  40.  
  41. number_cities_crawled = 0. # Keep track of number of cities crawled. Print the value to make sure there is no Timeout and that the crawler is progressing
  42. total_cities_to_crawl = 0.
  43. number_deals_crawled = 0. # Keep track of number of deals crawled. Print the value to make sure there is no Timeout and that the crawler is progressing
  44. total_deals_to_crawl = 0.
  45.  
  46. # ----- End initialize -----
  47. # ----- Begin procedures -----
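
# All of the scraping below uses the same cursor idiom: find a start marker, skip
# past it, read up to an end marker, then keep only the remainder. A hypothetical
# helper that captures the pattern (the crawler inlines it instead, which keeps
# each extraction step explicit):

def extract_between(code, start_marker, end_marker):
    start_pos = code.find(start_marker)
    if start_pos == -1:
        return None, code # Marker not found: leave the input untouched
    code = code[start_pos + len(start_marker):]
    end_pos = code.find(end_marker)
    value = code[:end_pos]
    return value, code[end_pos:] # Return the extracted value and the remaining HTML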

def fetch_cities(seed): # Crawl the seed page to find all cities Groupon.es operates in

    global total_cities_to_crawl

    def extract_cities(html):

        global total_cities_to_crawl

        code = html
        start_pos = code.find('<div id="citySelectBox" ')
        code = code[start_pos:]
        for i in range(2): # Skip the first two links, as they are not cities
            start_pos = code.find('<li class')
            skip = len('<li class')
            code = code[start_pos+skip:]
            start_pos = code.find('</li>')
            skip = len('</li>')
            code = code[start_pos+skip:]

        while code.find('</ul>') > 20: # Heuristic to detect the end of the list of cities: once the closing </ul> is within ~20 characters of the cursor, no <li> entries remain
            total_cities_to_crawl += 1

            # Fetch city relative URL
            start_pos = code.find("window.location.href = 'http://www.groupon.es/deals/")
            skip = len("window.location.href = 'http://www.groupon.es/deals/")
            code = code[start_pos+skip:]
            end_pos = code.find("';")
            url = code[:end_pos]
            code = code[end_pos:]

            # Fetch city name
            start_pos = code.find('<span>')
            skip = len('<span>')
            code = code[start_pos+skip:]
            end_pos = code.find('</span>')
            skip = len('</span>')
            name = code[:end_pos]
            code = code[end_pos+skip:]

            # Add to the list of cities
            cities.append([name, url])

    if seed == seed_barcelona: # Use seed_barcelona to reduce page requests and thus take less time to execute!
        cities.append(['Barcelona', seed_barcelona])
        total_cities_to_crawl += 1
    else:
        seed_response = urllib2.urlopen(seed)
        seed_html = seed_response.read()
        extract_cities(seed_html)

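# A hedged alternative to extract_cities() above: one regular-expression pass
# instead of cursor arithmetic. It assumes the same markup (an onclick redirect
# to /deals/<city> followed by a <span>city name</span>) and, unlike the
# original, does not skip the two leading non-city entries:
import re

def extract_cities_regex(html):
    block = html[html.find('<div id="citySelectBox"'):] # Restrict the search to the city selector
    pattern = r"window\.location\.href = 'http://www\.groupon\.es/deals/([^']+)';.*?<span>([^<]+)</span>"
    return [[name, url] for url, name in re.findall(pattern, block, re.DOTALL)]
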
def crawl_city(url): # Crawl a city page to find the URLs of all its deals

    global number_cities_crawled

    city_response = urllib2.urlopen(url)
    city_html = city_response.read()

    number_cities_crawled += 1

    def crawl_local(html): # Search for all local deal URLs in every city (or only Barcelona if seed_barcelona was used)
        code = html
        start_pos = code.find('var itemsLocalDeals = [')
        skip = len('var itemsLocalDeals = [')
        code = code[start_pos+skip:]

        while code.find('dealPermaLink":"') < code.find('var itemsShoppingDeals = ['): # If a deal URL is closer than the marker for the next category, we must still be in the 'Local' category
            start_pos = code.find('dealPermaLink":"')
            skip = len('dealPermaLink":"')
            code = code[start_pos+skip:]
            end_pos = code.find('"')
            url = code[:end_pos]
            deals_to_crawl['Local'].append(url)
            code = code[end_pos:]

    def crawl_shopping(html): # Search for shopping deal URLs only once, as they do not depend on the city
        code = html
        start_pos = code.find('var itemsShoppingDeals = [')
        skip = len('var itemsShoppingDeals = [')
        code = code[start_pos+skip:]

        while code.find('dealPermaLink":"') < code.find('var itemsTravelDeals = ['): # If a deal URL is closer than the marker for the next category, we must still be in the 'Shopping' category
            start_pos = code.find('dealPermaLink":"')
            skip = len('dealPermaLink":"')
            code = code[start_pos+skip:]
            end_pos = code.find('"')
            url = code[:end_pos]
            deals_to_crawl['Shopping'].append(url)
            code = code[end_pos:]

    def crawl_travel(html): # Search for travel deal URLs only once, as they do not depend on the city
        code = html
        start_pos = code.find('var itemsTravelDeals = [')
        skip = len('var itemsTravelDeals = [')
        code = code[start_pos+skip:]

        while code.find('dealPermaLink":"') != -1: # If the crawler can't find the next 'dealPermaLink', there are no deals left
            start_pos = code.find('dealPermaLink":"')
            skip = len('dealPermaLink":"')
            code = code[start_pos+skip:]
            end_pos = code.find('"')
            url = code[:end_pos]
            deals_to_crawl['Travel'].append(url)
            code = code[end_pos:]

    crawl_local(city_html) # Always crawl, to fetch all deals from all cities
    if len(deals_to_crawl['Shopping']) == 0: # Crawl only once, as deals in this category are independent of the city
        crawl_shopping(city_html)
    if len(deals_to_crawl['Travel']) == 0: # Crawl only once, as deals in this category are independent of the city
        crawl_travel(city_html)

    print str( round( ( number_cities_crawled / total_cities_to_crawl ) * 100, 2)) + '% complete ...'

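# The deal lists parsed above are embedded in the page as JavaScript arrays
# (e.g. "var itemsLocalDeals = [...];"). A hedged alternative sketch using the
# json module instead of position arithmetic; it assumes the array literal is
# valid JSON, which may not hold for every page:
import json

def extract_deal_urls(html, marker): # marker is e.g. 'var itemsLocalDeals = '
    start_pos = html.find(marker)
    if start_pos == -1:
        return []
    start = html.find('[', start_pos)
    end = html.find('];', start)
    try:
        items = json.loads(html[start:end + 1])
    except ValueError: # Not valid JSON after all: fall back to an empty list
        return []
    return [item['dealPermaLink'] for item in items if 'dealPermaLink' in item]
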
def fetch_deals(): # Build the full URL for each city and crawl it

    url_prefix = 'http://www.groupon.es/all-deals/'

    for city in cities:
        deal_url = url_prefix + city[1]
        crawl_city(deal_url)

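# Note: fetch_deals() joins URLs by plain string concatenation, which works here
# because city[1] is a bare slug like 'barcelona'. The stdlib equivalent (a
# hedged alternative) would be:
#
#     import urlparse
#     deal_url = urlparse.urljoin('http://www.groupon.es/all-deals/', city[1])
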
def crawl_deals(deal_type):

    global number_deals_crawled

    def clean_up_string(string):
        # Strip whitespace and any undesired characters: allow only digits, letters and commas.
        # Commas are decimal separators in the Spanish locale, so they are converted to '.' for float(),
        # while periods (thousands separators, e.g. '1.234,56') are dropped entirely
        output = ''
        for i in range(len(string)):
            if 48 <= ord(string[i]) <= 57 or 65 <= ord(string[i]) <= 90 or 97 <= ord(string[i]) <= 122 or ord(string[i]) == 44:
                if ord(string[i]) == 44:
                    output += "."
                else:
                    output += string[i]
        return output

    for i in range(len(deals_to_crawl[deal_type])):
        full_url = 'http://www.groupon.es' + deals_to_crawl[deal_type][i]

        deal_response = urllib2.urlopen(full_url)
        deal_html = deal_response.read()

        number_deals_crawled += 1
        deal = []

        # Find price
        start_pos = deal_html.find('Precio:')
        deal_html = deal_html[start_pos:]
        start_pos = deal_html.find('<span class="noWrap">')
        skip = len('<span class="noWrap">')
        deal_html = deal_html[start_pos+skip:]
        end_pos = deal_html.find(' \xe2') # \xe2 is the first byte of the UTF-8 encoded euro sign (€)
        deal_price = deal_html[:end_pos]
        deal_html = deal_html[end_pos:]
        deal.append(float(clean_up_string(deal_price)))

        # Find discount
        start_pos = deal_html.find('Descuento</td>')
        if start_pos != -1: # Some Shopping deals don't display a discount ... Assumption: those products simply have no discount, so the crawler appends a 0
            deal_html = deal_html[start_pos:]
            start_pos = deal_html.find('<td class="col1">')
            skip = len('<td class="col1">')
            deal_html = deal_html[start_pos+skip:]
            end_pos = deal_html.find('%')
            deal_discount = deal_html[:end_pos]
            deal_html = deal_html[end_pos:]
            deal.append(float(clean_up_string(deal_discount)))

            # Find saving
            start_pos = deal_html.find('<td>')
            skip = len('<td>')
            deal_html = deal_html[start_pos+skip:]
            end_pos = deal_html.find(' \xe2') # \xe2 is the first byte of the UTF-8 encoded euro sign (€)
            deal_saving = deal_html[:end_pos]
            deal_html = deal_html[end_pos:]
            deal.append(float(clean_up_string(deal_saving)))
        else:
            deal_discount = 0.
            deal_saving = 0.
            deal.append(deal_discount)
            deal.append(deal_saving)

        # Find the number of people who bought the coupon
        start_pos = deal_html.find('<span id="jDealSoldAmount">')
        if start_pos != -1: # If this string can't be found, 0 people have bought the deal so far
            skip = len('<span id="jDealSoldAmount">')
            deal_html = deal_html[start_pos+skip:]
            end_pos = deal_html.find('</span>')
            deal_bought = deal_html[:end_pos]
            deal.append(float(clean_up_string(deal_bought)))
        else:
            deal.append(0)

        deal.append(deals_to_crawl[deal_type][i]) # Append the deal's relative URL as the last element of the list

        # Add to dictionary
        deals[deal_type].append(deal)
        print str( round( ( number_deals_crawled / total_deals_to_crawl ) * 100, 2)) + '% complete ...'

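# A hedged alternative to the price extraction above, using a regular expression
# instead of find()/slicing. Hypothetical helper, shown only for comparison; it
# mirrors clean_up_string() by dropping thousands separators ('.') and converting
# the Spanish decimal comma to a point:
import re

def extract_price_regex(html):
    match = re.search(r'<span class="noWrap">\s*([\d.,]+)\s*\xe2\x82\xac', html) # \xe2\x82\xac is the full UTF-8 euro sign
    if match is None:
        return None # Price not found on this page
    return float(match.group(1).replace('.', '').replace(',', '.'))
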
def reporting(deal_type, print_forecast = False):

    def forecast(daily_sales):
        # NOTE: This is by no means a rigorous method of forecasting yearly sales, but it is probably the best we can do with 'one-time' data
        # Just imagine what you could do with a slightly more sophisticated robot!
        return daily_sales * 365

    def format_number(number):
        # Insert a thousands separator every three digits, e.g. 1234567 -> '1,234,567'
        string = str(number)
        formatted = ''
        formatted_reverse = ''
        for i in range(len(string)):
            if (i+1) % 3 == 0 and i < len(string)-1:
                formatted += string[len(string)-i-1]+','
            else:
                formatted += string[len(string)-i-1]

        for i in range(len(formatted)):
            formatted_reverse += formatted[len(formatted)-i-1]
        return formatted_reverse

    # The per-category and 'Global' reports share the same logic; the only
    # difference is which deal lists are aggregated
    if deal_type != 'Global':
        deal_lists = [deals[deal_type]]
    else:
        deal_lists = deals.values()

    number_deals = 0
    min_price = 9999999.
    max_price = -1.
    accumulated_price = 0.
    min_discount = 100.
    max_discount = 0.
    accumulated_discount = 0.
    min_saving = 9999999.
    max_saving = -1.
    accumulated_saving = 0.
    total_sales_units = 0.
    total_sales_euros = 0.
    total_saving_euros = 0.
    best_deal = [0., '']
    for deal_list in deal_lists:
        number_deals += len(deal_list)
        for i in deal_list: # i = [price, discount, saving, units_sold, relative_url]
            if i[0] * i[3] > best_deal[0]: # What is the best performing deal in terms of revenue?
                best_deal[0] = i[0] * i[3]
                best_deal[1] = i[4]
            if i[0] < min_price:
                min_price = i[0]
            if i[0] > max_price:
                max_price = i[0]
            if i[1] < min_discount:
                min_discount = i[1]
            if i[1] > max_discount:
                max_discount = i[1]
            if i[2] < min_saving:
                min_saving = i[2]
            if i[2] > max_saving:
                max_saving = i[2]
            total_sales_units += i[3]
            total_sales_euros += i[0] * i[3]
            total_saving_euros += i[2] * i[3]
            accumulated_price += i[0]
            accumulated_discount += i[1]
            accumulated_saving += i[2]
    avg_deal_price = accumulated_price / number_deals
    avg_deal_discount = accumulated_discount / number_deals
    avg_deal_saving = accumulated_saving / number_deals
    avg_sale_value = total_sales_euros / total_sales_units
    avg_sale_saving = total_saving_euros / total_sales_units
    string = '############# ' + deal_type + ' #############\n'
    string += '\t\t\tMin:\tAvg:\tMax:\n'
    string += 'Price €:\t' + str(int(round(min_price))) + '\t\t' + str(int(round(avg_deal_price))) + '\t\t' + str(int(round(max_price))) + '\n'
    string += 'Discount %:\t' + str(int(round(min_discount))) + '\t\t' + str(int(round(avg_deal_discount))) + '\t\t' + str(int(round(max_discount))) + '\n'
    string += 'Saving €:\t' + str(int(round(min_saving))) + '\t\t' + str(int(round(avg_deal_saving))) + '\t\t' + str(int(round(max_saving))) + '\n'
    string += '---------------------------------\n'
    string += 'Deals offered #: ' + str(number_deals) + '\n'
    string += 'Total deals sold #: ' + format_number(int(round(total_sales_units))) + '\n'
    string += 'Total savings €: ' + format_number(int(round(total_saving_euros))) + '\n'
    string += 'Average saving €: ' + format_number(int(round(avg_sale_saving))) + '\n'
    string += 'Total sales €: ' + format_number(int(round(total_sales_euros))) + '\n'
    string += 'Average ticket €: ' + format_number(int(round(avg_sale_value))) + '\n'
    string += 'Best deal: ' + format_number(int(round(best_deal[0]))) + ' € => ' + best_deal[1]
    if print_forecast:
        string += '\nSales forecast for this year €: ' + format_number(int(round(forecast(total_sales_euros))))
    return string

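# Example of how a report is produced once the deal dictionaries are filled
# (hypothetical calls; the control panel below drives the real ones):
#
#     print reporting('Local')                         # single-category report
#     print reporting('Global', print_forecast = True) # aggregate report + naive forecast
#
# The forecast itself is simple arithmetic: if today's crawl finds
# total_sales_euros of, say, 50,000 €, forecast() projects 50000 * 365 = 18,250,000 € for the year.
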
# ----- End procedures -----
# ----- Begin control panel -----

# Set these switches to True or False depending on what you want to crawl. No changes further below are necessary, unless you want to fiddle around with the crawler ;)

switch_seed_all = False # True: crawls ALL deals on the site; False: crawls only deals in Barcelona. Leave as False for faster execution.
switch_crawl_shopping = True
switch_crawl_travel = True
switch_crawl_local = True
switch_print_global_report = True # Print a global report covering all 3 types of deals (only set to True if the previous 3 switches are True)
switch_print_forecast = True # Print a (very non-scientific) yearly sales forecast

# ----- End control panel -----

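# The report above is ephemeral; a genuinely daily comparison would persist each
# crawl. A minimal sketch (hypothetical helper, not called anywhere) that dumps
# the deals dictionary to a dated CSV file:
import csv
import datetime

def save_deals_csv(path_prefix='groupon_deals'):
    filename = '%s_%s.csv' % (path_prefix, datetime.date.today().isoformat())
    with open(filename, 'wb') as f: # 'wb' because the Python 2 csv module writes bytes
        writer = csv.writer(f)
        writer.writerow(['category', 'price', 'discount', 'saving', 'units_sold', 'url'])
        for category in deals:
            for deal in deals[category]:
                writer.writerow([category] + deal)
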
# Main script: the national crawl and the Barcelona-only test crawl share the
# same flow; only the seed passed to fetch_cities() differs
if not switch_crawl_local and not switch_crawl_shopping and not switch_crawl_travel:
    print 'Switches are set to crawl nothing ...'
else:
    print 'Crawling seed site ...'

    if switch_seed_all:
        fetch_cities(seed_all)
    else:
        fetch_cities(seed_barcelona)

    print 'Found '+str(len(cities))+' cities to crawl ...'

    fetch_deals()

    total_deals_to_crawl = 0
    if switch_crawl_shopping:
        total_deals_to_crawl += len(deals_to_crawl['Shopping'])
    if switch_crawl_travel:
        total_deals_to_crawl += len(deals_to_crawl['Travel'])
    if switch_crawl_local:
        total_deals_to_crawl += len(deals_to_crawl['Local'])

    print 'Found '+str(total_deals_to_crawl)+' deals to crawl ...'

    if switch_crawl_shopping:
        crawl_deals('Shopping')
    if switch_crawl_travel:
        crawl_deals('Travel')
    if switch_crawl_local:
        crawl_deals('Local')

    if switch_crawl_shopping:
        print reporting('Shopping', switch_print_forecast)
    if switch_crawl_travel:
        print reporting('Travel', switch_print_forecast)
    if switch_crawl_local:
        print reporting('Local', switch_print_forecast)
    if switch_print_global_report:
        print reporting('Global', switch_print_forecast)