lexquarkie

vk_shop_parser.py

Jul 26th, 2016
import json
import re

from grab.spider import Spider, Task
from selenium import webdriver

# Parsed items, keyed by shop number, then by sequential item id.
data = {}

class Crawler(Spider):
    initial_urls = ['https://vk.com/spark.design']

    def task_initial(self, grab, task):
        shop_offset = 0
        print("Try to parse: " + task.url)
        # The "load more" button's onclick handler contains the shop id.
        shop_url_selector = grab.doc.select('//*[@id="ui_market_items_load_more"]').attr('onclick')
        re_shop_url = re.compile(r'market-(\d{1,12})')
        shop_url = re_shop_url.search(shop_url_selector).group(0)     # 'market-NNNNNN'
        shop_number = re_shop_url.search(shop_url_selector).group(1)  # 'NNNNNN'
        shop_full_url = "https://vk.com/" + shop_url
        print(shop_url)
        shop_itemscount = grab.doc.select('//*[@class="module clear market_module _module"]//*[@class="header_count fl_l"]').text()
        # Schedule one showcase task per page; the market is paginated
        # 24 items at a time, hence the offset step below.
        while shop_offset < int(shop_itemscount):
            yield Task('showcase', url=shop_full_url + '?offset=' + str(shop_offset),
                       shop_key=shop_url, shop_num=shop_number, offset=shop_offset)
            shop_offset += 24
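
    # For example, a shop with 60 items and a (hypothetical) id of 123456
    # produces three showcase tasks:
    #   https://vk.com/market-123456?offset=0
    #   https://vk.com/market-123456?offset=24
    #   https://vk.com/market-123456?offset=48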

    def task_showcase(self, grab, task):
        print("Go: " + task.url)
        re_price = re.compile(r'>(\d+)\D(\d*)')
        item_id = task.offset
        for item_node in grab.doc.select('//div[@class="market_list"]/div'):
            item_id += 1
            item_native_id = item_node.attr('data-id')
            item_img = item_node.select('div/div/a/img').attr('src')
            item_price_raw = item_node.select('*/div[@class="market_row_price"]').html()
            price_match = re_price.search(item_price_raw)
            item_price = int(price_match.group(1))
            item_price_2 = price_match.group(2)
            if item_price_2:  # remove digit delimiter if price > 1000 (dumb, but working)
                item_price = item_price * 1000 + int(item_price_2)
            item_attributes = {"id": item_id,
                               "native_id": item_native_id,
                               "img_url": item_img,
                               "price": item_price,
                               "name": "",
                               "cat": ""}
            item_details(item_attributes=item_attributes, shop=task.shop_num,
                         item_native_id=item_native_id, item_key=item_id)
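
# A worked example of the price regex in task_showcase (the sample
# strings are hypothetical; real VK price markup may differ):
#   '...price">1 250 <span>'  ->  group(1)='1',   group(2)='250'  ->  price 1250
#   '...price">250 <span>'    ->  group(1)='250', group(2)=''     ->  price 250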

def item_details(item_attributes, shop, item_native_id, item_key):
    # Open the product popup in a headless browser to read the description
    # and category, which are not available in the showcase listing.
    url = 'http://vk.com/market-' + str(shop) + '?w=product-' + str(shop) + '_' + str(item_native_id)
    d = webdriver.PhantomJS()
    d.implicitly_wait(0.9)
    try:
        d.get(url)
        item_desc = d.find_element_by_id("market_item_description").text
        item_cat = d.find_element_by_class_name("market_item_category").text
    finally:
        d.quit()  # always release the PhantomJS process
    item_attributes['desc'] = item_desc
    item_attributes['cat'] = item_cat
    print(item_attributes)
    save_item(item_attributes=item_attributes, shop=shop, item_key=item_key)

def save_item(item_attributes='', shop='', item_key=''):
    if shop not in data:
        data[shop] = {}
    if item_key not in data[shop]:
        data[shop][item_key] = {}
    # Only fill attributes that are not already recorded for this item.
    for key in item_attributes:
        if key not in data[shop][item_key]:
            data[shop][item_key][key] = item_attributes[key]

def export_file(data, filename):
    with open(filename, 'w') as f:
        json.dump(data, f)
    return json.dumps(data)

def main():
    bot = Crawler()
    try:
        bot.run()
    except KeyboardInterrupt:
        print('Keyboard Interrupt')
    print(export_file(data, 'out.json'))

if __name__ == '__main__':
    main()
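
For reference, a minimal sketch of the structure export_file writes to out.json, with hypothetical values (integer item keys become strings during JSON serialization):

{
    "123456": {
        "1": {
            "id": 1,
            "native_id": "987654",
            "img_url": "https://pp.vk.me/c123456/photo.jpg",
            "price": 1250,
            "name": "",
            "cat": "Prints",
            "desc": "Hypothetical item description"
        }
    }
}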