Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- from xls_prep import create_n_fill, read
- from config import *
- import logging
- from grab.spider import Spider, Task
- from datetime import datetime
- import shutil
- import requests
- import pprint
- import json
- import csv
# Shared crawl state (module level; mutated by the spider tasks below).
attributes = []  # unique attribute names, first-seen order (drives the SQL export)
categories = []  # category names; list position + 1 becomes the OpenCart category id
items = {}       # scraped products keyed by product-page URL
product_id = 0   # running product counter (incrementing is currently commented out)
class Crawler(Spider):
    """Grab-framework spider: site root -> category pages -> product pages.

    Results accumulate in the module-level ``categories`` list and ``items``
    dict rather than on the spider instance.
    """

    # The crawl is seeded with the site root from config.py.
    initial_urls = [SITE_URL]

    def task_initial(self, grab, task):
        """Front page: extract category links and queue one 'category' task each."""
        if VERBOSE:
            print("TASK : initial " + task.url)
        # NOTE(review): proxy is hard-wired to the second PROXY_LIST entry — confirm intended.
        grab.setup(proxy=PROXY_LIST[1])
        for cat in grab.doc.select(INITIAL_SELECTOR):
            link = cat.attr('href')
            cat_name = cat.text()
            categories.append(cat_name)  # position in this list defines the category id
            yield Task('category', url=link, cat=cat_name)

    def task_category(self, grab, task):
        """Category page: queue an 'item' task for every product link found."""
        if VERBOSE:
            print("TASK : category " + task.cat)
        for k in grab.doc.select(CATEGORY_SELECTOR):
            item_link = k.attr("href")
            # product_id += 1  # disabled: ids are recomputed locally in task_item
            if task.cat not in categories:
                categories.append(task.cat)
            # NOTE(review): the module-level product_id is never incremented, so every
            # task is queued with product_id == 0; task_item shadows it with a local.
            yield Task('item', url=item_link, cat=task.cat, product_id=product_id)

    def task_item(self, grab, task):
        """Product page: scrape title/price/attributes/description/pictures and
        store one record in ``items``, keyed by the page URL."""
        if VERBOSE:
            print("TASK : item " + task.url)
        item_pics = []
        item_title = grab.doc.select(ITEM_TITLE_SELECTOR).text()
        # Drop thousands separators so the price is a plain numeric string.
        item_price = grab.doc.select(ITEM_PRICE_SELECTOR).text().replace(',', '')
        item_attr = grab.doc.select(ITEM_ATTR_SELECTOR).html()
        item_attr = clear_desc(item_attr)
        item_desc = grab.doc.select(ITEM_DESC_SELECTOR).text()
        # Strip the leading "Описание товара" ("product description") heading — 15 chars.
        if item_desc.startswith('Описание товара'):
            item_desc = item_desc[15:]
        item_cat = categories.index(task.cat) + 1  # OpenCart category ids are 1-based
        product_id = len(items)  # local id: insertion-order position of this item
        # Collect main gallery image links ...
        node_pics = grab.doc.select('//div[contains(@class,"images")]')
        for elem in node_pics:
            item_pics.append(elem.select('./a').attr('href'))
        # ... plus thumbnail links not already collected.
        node_pics = grab.doc.select('//div[contains(@class,"thumbnails")]')
        for elem in node_pics:
            if elem not in item_pics:
                item_pics.append(elem.select('./a').attr('href'))
        # NOTE(review): attr_divide's return statement is commented out in this file,
        # so item_attr becomes None here — confirm before relying on 'attr' below.
        item_attr = attr_divide(item_attr)
        if task.url not in items.keys():
            items[task.url] = {
                'id': product_id,
                'title': item_title,
                'price': item_price,
                'cat': item_cat,
                'desc': item_desc,
                'pics': item_pics,
                'attr': item_attr}
def attr_divide(attr):
    """Split a raw attribute HTML fragment into a list of 'Name: value' lines.

    Side effect: registers every attribute name (the part before the first
    colon) in the module-level ``attributes`` list, preserving first-seen
    order and skipping duplicates.

    Bug fixes vs. the original:
    - the ``return`` was commented out, so callers always received None;
    - the colon-less 'Габариты ' label was patched in the list but the key
      was still derived from the unpatched string (and list.index() was
      called inside the loop — O(n^2) and wrong in case of duplicates).
    The stray per-key debug print was also removed.
    """
    cleaned = (attr.replace('\xa0', ' ')
                   .replace('\r', '')
                   .replace('<div class="prop_table">', '')
                   .replace('<div>', '')
                   .replace('<span class="prop_name">', '')
                   .replace('<span style="line-height', '')
                   .strip())
    item_attr = cleaned.split('\n')
    for i, prop in enumerate(item_attr):
        # One known label arrives without its colon; normalise it in place
        # so the key extraction below sees the corrected form.
        if prop == 'Габариты ':
            item_attr[i] = prop = 'Габариты: '
        attr_key = prop.split(':')[0]
        if attr_key not in attributes:
            attributes.append(attr_key)
    return item_attr
def save_img(url):
    """Download the image at *url* into the current directory.

    The local filename is the last path segment of the URL.

    Fixes vs. the original: the response is now closed via a context manager
    (``del response`` never released the underlying connection), and HTTP
    error statuses raise instead of silently saving an error page.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    filename = url.split('/')[-1]
    # Stream the body straight to disk so large images never sit in memory.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    print(filename + ' saved')
def clear_desc(item_desc):
    """Strip the known HTML wrapper tags from a product-description fragment.

    Closing ``</p>`` tags become a single space (to keep words from fusing);
    everything else in the table below is removed outright.
    """
    replacements = (
        ('<p>', ''),
        ('</p>', ' '),
        ('<p class="record_title">', ''),
        ('<div itemprop="description">', ''),
        ('</div>', ''),
        ('\t', ''),
    )
    for needle, substitute in replacements:
        item_desc = item_desc.replace(needle, substitute)
    return item_desc
def prepare_xlsx():
    """Create and pre-fill the product and category spreadsheets.

    Thin wrapper around xls_prep.create_n_fill; all file names, headers and
    sheet layouts come from config.py constants.
    """
    create_n_fill(PRODUCTS_NAME, PRODUCTS_HEADERS, PRODUCTS_SHEETS)
    create_n_fill(CATEGORIES_NAME, CATEGORIES_HEADERS, CATEGORIES_SHEETS)
def trim(url):
    """Turn an absolute site URL into a path relative to SITE_URL.

    Slices off the SITE_URL prefix plus the '/' separator that follows it;
    assumes *url* actually starts with SITE_URL.
    """
    prefix_length = len(SITE_URL) + 1
    return url[prefix_length:]
def sql_export_attributes():
    """Print SQL statements that reload OpenCart's oc_attribute_description table.

    Emits a TRUNCATE followed by two INSERTs per collected attribute name
    (one for each of language ids 1 and 2). Attribute ids start at 100 and
    follow the order of the module-level ``attributes`` list.

    Fixes vs. the original: ``attributes.index(p)`` was called twice per
    attribute (O(n^2) overall) and single quotes in attribute names were not
    escaped, producing broken SQL.
    """
    print('TRUNCATE TABLE `oc_attribute_description`;')
    for offset, name in enumerate(attributes):
        attribute_id = 100 + offset
        safe_name = name.replace("'", "''")  # SQL-escape quotes inside names
        for language_id in (1, 2):
            print("INSERT INTO `oc_attribute_description` "
                  "(`attribute_id`, `language_id`, `name`) VALUES "
                  "('{}', '{}', '{}');".format(attribute_id, language_id, safe_name))
def main():
    """Run the crawl, then export products to XLSX and attributes as SQL."""
    bot = Crawler()
    # prepare_xlsx()  # one-off: creates the spreadsheet templates
    logging.basicConfig(filename='ledron.log', level=logging.DEBUG)
    try:
        bot.run()
    except KeyboardInterrupt:
        # Ctrl-C aborts the crawl but still falls through to the exports,
        # so a partial run is written out rather than lost.
        print('Keyboard Interrupt')
    read('import-export/products.xlsx', items.values())
    sql_export_attributes()


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement