Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # athletic parser
- import time
- from grab.spider import Spider, Task, Data
- from weblib.logs import default_logging
- import re
- import sys
- import os
- import config
def check_loaded(html='', slogan=config.SLOGAN):
    """
    Check that a page was loaded correctly.

    The page is considered loaded when the marker sequence ``slogan``
    occurs in the page text.

    :param html: page content. ``bytes``/``bytearray`` input is decoded as
        UTF-8 (undecodable bytes are replaced); any other object is
        converted with ``str()``.
    :param slogan: marker string to search for; defaults to
        ``config.SLOGAN``.
    :return: ``True`` if ``slogan`` is found in the page text, else ``False``.
    """
    if isinstance(html, (bytes, bytearray)):
        # str() on a bytes object yields its "b'...'" repr with non-ASCII
        # bytes escaped, so a non-ASCII slogan could never match. Decode
        # instead. NOTE(review): UTF-8 is assumed here -- confirm it matches
        # the encoding of the crawled site.
        text = html.decode('utf-8', 'replace')
    else:
        text = str(html)
    return slogan in text
class LbCrawler(Spider):
    """
    Spider that walks the site from the configured start URLs.

    Task flow: ``initial`` pages yield ``category`` tasks, ``category``
    pages yield ``product`` tasks, and ``product`` pages yield ``level_2``
    tasks. Every handler first verifies the page with ``check_loaded``.
    """

    # Start pages; the first one also serves as the base URL.
    initial_urls = config.INITIAL_URLS
    base_url = config.INITIAL_URLS[0]
    # Not used by any handler in this class; kept for external users.
    parsed_url = []

    def task_initial(self, grab, task):
        """Collect category links from a start page and schedule them."""
        if not check_loaded(grab.response.unicode_body()):
            # NOTE(review): only a warning is printed; link extraction below
            # still runs on the (possibly broken) page. Confirm whether an
            # early return is wanted here.
            print('Can\'t start parsing on', task.url)
        for category_url in grab.doc.select('//*[@class="group"]/a').attr_list('href'):
            yield Task(
                'category',
                url=grab.make_url_absolute(str(category_url)),
                priority=95,
            )

    def task_category(self, grab, task):
        """Collect product links from a category page and schedule them."""
        if not check_loaded(grab.response.unicode_body()):
            # Page did not load properly: re-queue the same task.
            yield task.clone(refresh_cache=True, priority=70)
            return
        for product_url in grab.doc.select('//*[@class="image"]/a').attr_list('href'):
            print('Trying to parsing: ' + product_url)
            print('Category: ' + task.url)
            # Short pause between scheduling product tasks.
            time.sleep(.03)
            yield Task(
                'product',
                url=grab.make_url_absolute(str(product_url)),
                priority=95,
            )

    def task_product(self, grab, task):
        """Collect second-level links from a product page and schedule them."""
        # Use the decoded body, like the other handlers, so that the slogan
        # is searched in text rather than in raw bytes.
        if not check_loaded(grab.response.unicode_body()):
            time.sleep(.05)
            # Page did not load properly: re-queue the same task.
            yield task.clone(refresh_cache=True, priority=60)
            return
        print('Parsing product ... ' + task.url)
        for level_2_url in grab.doc.select('//div[@class="browse-ttl"]/a').attr_list('href'):
            # The original referenced an undefined name here (NameError);
            # the link being scheduled is the loop variable itself.
            # NOTE(review): no ``task_level_2`` handler is visible in this
            # class -- confirm it is defined elsewhere.
            yield Task(
                'level_2',
                url=grab.make_url_absolute(str(level_2_url)),
                priority=85,
            )
def start_parsing():
    """
    Set up logging, run the ``LbCrawler`` spider and terminate the process.

    Log destinations and the thread count are read from ``config``. A
    ``KeyboardInterrupt`` raised while the spider runs is swallowed so the
    statistics are still printed before ``sys.exit()`` is called.
    """
    default_logging(grab_log=config.GRAB_LOG, network_log=config.NETWORK_LOG)

    crawler = LbCrawler(thread_number=config.THREAD_NUMBER)

    # Cache-related attributes are switched off.
    crawler.cache_enabled = False
    crawler.cache = None

    crawler.charset = 'utf-8'

    # No proxies for now; loading a proxy list would look like:
    # crawler.load_proxylist(config.PROXY_LIST, 'text_file', proxy_type='http')
    crawler.proxylist_enabled = False
    crawler.proxy = None

    try:
        crawler.run()
    except KeyboardInterrupt:
        # Manual interruption is fine: fall through and report statistics.
        pass

    # Disabled debugging helpers kept from the original:
    # if config.DEBUG:
    #     crawler.save_list('fatal', config.FATAL_ERROR_DUMP)
    # comp_db.session.commit()

    stats = crawler.render_stats()
    print(stats)
    sys.exit()
# Start the crawler only when this file is executed as a script.
if __name__ == "__main__":
    start_parsing()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement