Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- # Created on 2016-07-14 23:05:17
- # Project: cc
- import logging
- import logging.config
- import os.path
def initialize_logger(output_dir):
    """Configure the root logger with console, error-file and debug-file output.

    Three handlers sharing one record format are attached to the root logger:

    * a console handler (``logging.StreamHandler``) for INFO and above,
    * ``<output_dir>/error.log`` for ERROR and above, opened lazily when the
      first matching record is emitted,
    * ``<output_dir>/all.log`` for DEBUG and above, opened immediately.

    Both files are opened in ``"w"`` mode, so previous contents are truncated.
    Every call adds three more handlers to the root logger; nothing is
    removed or de-duplicated here.

    Args:
        output_dir: Directory that receives ``error.log`` and ``all.log``.
            It is not created by this function.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # One formatter is enough: all three handlers use the same layout.
    formatter = logging.Formatter(
        '%(levelname)s %(asctime)s %(module)s %(process)d %(thread)d %(message)s'
    )

    # Console handler, INFO and above. This used to be a second FileHandler
    # on "error.log" in "w" mode, which competed with the real error handler
    # for the same file and truncated its output.
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # Error file handler, ERROR and above. delay=True postpones opening the
    # file until the first record is actually written.
    error_handler = logging.FileHandler(
        os.path.join(output_dir, "error.log"), "w", encoding=None, delay=True
    )
    error_handler.setLevel(logging.ERROR)
    error_handler.setFormatter(formatter)
    logger.addHandler(error_handler)

    # Debug file handler, DEBUG and above (i.e. everything).
    debug_handler = logging.FileHandler(os.path.join(output_dir, "all.log"), "w")
    debug_handler.setLevel(logging.DEBUG)
    debug_handler.setFormatter(formatter)
    logger.addHandler(debug_handler)
# Module-level bootstrap: attach the log handlers (files are written under the
# directory given here), then emit one INFO record as a smoke test.
initialize_logger('C:/Users/zack/Desktop/scraper')
logging.info("test")
- from pyspider.libs.base_handler import *
class Handler(BaseHandler):
    """pyspider handler: seed from a Bing video search, follow links, record titles.

    ``on_start`` queues the seed search page, ``index_page`` queues every
    absolute ``http...`` link found on a fetched page, and ``detail_page``
    returns the URL and ``<title>`` text of each linked page. ``on_message``
    lets an incoming message supply an additional page to index.
    """

    # Settings applied to the requests made through self.crawl().
    crawl_config = {
        'headers': {
            'Cookie': 'SRCHHPGUSR=ADLT=OFF',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            'cache-control': 'no-cache',
        },
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the seed search-results page; the decorator asks for a daily re-run."""
        # NOTE(review): `logger` is not defined in this file; presumably
        # pyspider injects it into the script module at load time -- confirm.
        logger.info("test")
        # Alternative seed that was previously tried:
        # 'https://www.google.com.au/search?q=test&ie=utf-8&oe=utf-8&client=firefox-b-ab&gfe_rd=cr&ei=pEOcV7nBN9LN8gfMpov4Ag'
        self.crawl(
            'http://www.bing.com/videos/search?q=test&first=36&count=35&FORM=PEVR1',
            callback=self.index_page,
        )

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every anchor whose href starts with "http" for detail_page."""
        anchors = response.doc('a[href^="http"]')
        for anchor in anchors.items():
            self.crawl(anchor.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Return the fetched page's URL and title text as the crawl result."""
        page_url = response.url
        page_title = response.doc('title').text()
        return {
            "url": page_url,
            "title": page_title,
        }

    def on_message(self, project, msg):
        """Log a marker record, then crawl ``msg`` as a URL via index_page."""
        logging.info("test")
        self.crawl(msg, callback=self.index_page)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement