pars_ledron
lexquarkie, Feb 28th, 2017

from xls_prep import create_n_fill, read
from config import *
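
# NOTE: the wildcard import above is assumed to supply the constants referenced
# below; config.py itself is not shown in this paste. Judging from their use in
# this script, it should define at least:
#   SITE_URL, VERBOSE, PROXY_LIST,
#   INITIAL_SELECTOR, CATEGORY_SELECTOR,
#   ITEM_TITLE_SELECTOR, ITEM_PRICE_SELECTOR, ITEM_ATTR_SELECTOR, ITEM_DESC_SELECTOR,
#   PRODUCTS_NAME, PRODUCTS_HEADERS, PRODUCTS_SHEETS,
#   CATEGORIES_NAME, CATEGORIES_HEADERS, CATEGORIES_SHEETS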

import logging
from grab.spider import Spider, Task
from datetime import datetime

import shutil
import requests

import pprint   # pprint, json and csv are imported but never used below
import json
import csv

# Module-level accumulators shared between the spider tasks
attributes = []   # attribute names collected by attr_divide()
categories = []   # category names in the order they are discovered
items = {}        # scraped products keyed by item URL
product_id = 0    # default for the Task kwarg; task_item recomputes the real id

class Crawler(Spider):
    initial_urls = [SITE_URL]

    def task_initial(self, grab, task):
        if VERBOSE:
            print("TASK : initial  " + task.url)
        grab.setup(proxy=PROXY_LIST[1])

        # Every category link on the start page spawns a 'category' task
        for cat in grab.doc.select(INITIAL_SELECTOR):
            link = cat.attr('href')
            cat_name = cat.text()
            categories.append(cat_name)
            yield Task('category', url=link, cat=cat_name)

    def task_category(self, grab, task):
        if VERBOSE:
            print("TASK : category  " + task.cat)

        for k in grab.doc.select(CATEGORY_SELECTOR):
            item_link = k.attr("href")
            # product_id += 1
            if task.cat not in categories:
                categories.append(task.cat)
            yield Task('item', url=item_link, cat=task.cat, product_id=product_id)

    def task_item(self, grab, task):
        if VERBOSE:
            print("TASK : item  " + task.url)
        item_pics = []

        item_title = grab.doc.select(ITEM_TITLE_SELECTOR).text()
        item_price = grab.doc.select(ITEM_PRICE_SELECTOR).text().replace(',', '')
        item_attr = grab.doc.select(ITEM_ATTR_SELECTOR).html()
        item_attr = clear_desc(item_attr)
        item_desc = grab.doc.select(ITEM_DESC_SELECTOR).text()
        # Strip the leading "Описание товара" ("Product description") heading
        if item_desc.startswith('Описание товара'):
            item_desc = item_desc[len('Описание товара'):]
        item_cat = categories.index(task.cat) + 1  # adapt to OpenCart category numeration
        product_id = len(items)

        # Main image block plus thumbnails; keep each picture URL only once
        node_pics = grab.doc.select('//div[contains(@class,"images")]')
        for elem in node_pics:
            item_pics.append(elem.select('./a').attr('href'))

        node_pics = grab.doc.select('//div[contains(@class,"thumbnails")]')
        for elem in node_pics:
            href = elem.select('./a').attr('href')
            if href not in item_pics:
                item_pics.append(href)

        item_attr = attr_divide(item_attr)

        if task.url not in items:
            items[task.url] = {
                'id': product_id,
                'title': item_title,
                'price': item_price,
                'cat': item_cat,
                'desc': item_desc,
                'pics': item_pics,
                'attr': item_attr}

        # print(items)
        # print(item_price)
        # print(item_cat)
        # print(item_desc + '\n')
        # print(item_pics)
        # for item in item_pics:
        #     save_img(item)

def attr_divide(attr):
    # Strip the leftover markup around the property table and normalise whitespace
    attr = (attr.replace('\xa0', ' ')
                .replace('\r', '')
                .replace('<div class="prop_table">', '')
                .replace('<div>', '')
                .replace('<span class="prop_name">', '')
                .replace('<span style="line-height', '')
                .strip())
    item_attr = attr.split('\n')
    for p in item_attr:
        # "Габариты" (dimensions) sometimes comes without the colon; normalise it
        if p == 'Габариты ':
            item_attr[item_attr.index(p)] = 'Габариты: '
        attr_key = p.split(':')[0]
        if attr_key not in attributes:
            attributes.append(attr_key)
        print(attr_key)
    return item_attr

def save_img(url):
    # Download a picture into the working directory under its original basename
    filename = url.split('/')[-1]
    response = requests.get(url, stream=True)
    # Let urllib3 undo any content-encoding before the raw stream is copied
    response.raw.decode_content = True

    with open(filename, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)
    del response
    print(filename + '  saved')
    return

def clear_desc(item_desc):
    return (item_desc.replace('<p>', '')
                     .replace('</p>', ' ')
                     .replace('<p class="record_title">', '')
                     .replace('<div itemprop="description">', '')
                     .replace('</div>', '')
                     .replace('\t', ''))


def prepare_xlsx():
    create_n_fill(PRODUCTS_NAME, PRODUCTS_HEADERS, PRODUCTS_SHEETS)
    create_n_fill(CATEGORIES_NAME, CATEGORIES_HEADERS, CATEGORIES_SHEETS)


def trim(url):  # absolute url to relative
    new_url = url[len(SITE_URL) + 1:]
    return new_url


def sql_export_attributes():
    # Emit SQL for OpenCart's oc_attribute_description table: one row per language
    # (language_id 1 and 2), with attribute ids starting at 100
    print('TRUNCATE TABLE `oc_attribute_description`;')
    for i, p in enumerate(attributes):
        for language_id in (1, 2):
            print('INSERT INTO `oc_attribute_description` '
                  '(`attribute_id`, `language_id`, `name`) '
                  'VALUES (\'' + str(100 + i) + '\', \'' + str(language_id) + '\', \'' + p + '\');')

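# Illustrative output of sql_export_attributes(), assuming 'Габариты' is the first
# collected attribute (the values shown here are hypothetical):
#   TRUNCATE TABLE `oc_attribute_description`;
#   INSERT INTO `oc_attribute_description` (`attribute_id`, `language_id`, `name`) VALUES ('100', '1', 'Габариты');
#   INSERT INTO `oc_attribute_description` (`attribute_id`, `language_id`, `name`) VALUES ('100', '2', 'Габариты');
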
def main():
    bot = Crawler()
    # prepare_xlsx()

    logging.basicConfig(filename='ledron.log', level=logging.DEBUG)

    try:
        bot.run()
    except KeyboardInterrupt:
        print('Keyboard Interrupt')

    # Hand the scraped items to xls_prep.read(), then print the attribute SQL to stdout
    read('import-export/products.xlsx', items.values())
    sql_export_attributes()


if __name__ == '__main__':
    main()
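
# Typical invocation (assumed): the SQL statements are printed to stdout, so they
# can be captured with a redirect, e.g.
#   python pars_ledron.py > oc_attributes.sql
# Crawl progress goes to ledron.log; the script name pars_ledron.py is a guess
# taken from the paste title.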