Advertisement
Guest User

Untitled

a guest
Jul 30th, 2016
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.80 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. # Created on 2016-07-14 23:05:17
  4. # Project: cc
  5. import logging
  6. import logging.config
  7. import os.path
  8.  
  9. def initialize_logger(output_dir):
  10. logger = logging.getLogger()
  11. logger.setLevel(logging.DEBUG)
  12.  
  13. # create console handler and set level to info
  14. handler = logging.FileHandler(os.path.join(output_dir, "error.log"),"w", encoding=None, delay="true")
  15. handler.setLevel(logging.INFO)
  16. formatter = logging.Formatter('%(levelname)s %(asctime)s %(module)s %(process)d %(thread)d %(message)s')
  17. handler.setFormatter(formatter)
  18. logger.addHandler(handler)
  19.  
  20. # create error file handler and set level to error
  21. handler = logging.FileHandler(os.path.join(output_dir, "error.log"),"w", encoding=None, delay="true")
  22. handler.setLevel(logging.ERROR)
  23. formatter = logging.Formatter('%(levelname)s %(asctime)s %(module)s %(process)d %(thread)d %(message)s')
  24. handler.setFormatter(formatter)
  25. logger.addHandler(handler)
  26.  
  27. # create debug file handler and set level to debug
  28. handler = logging.FileHandler(os.path.join(output_dir, "all.log"),"w")
  29. handler.setLevel(logging.DEBUG)
  30. formatter = logging.Formatter('%(levelname)s %(asctime)s %(module)s %(process)d %(thread)d %(message)s')
  31. handler.setFormatter(formatter)
  32. logger.addHandler(handler)
  33.  
  34. initialize_logger('C:/Users/zack/Desktop/scraper')
  35. logging.info("test")
  36.  
  37. from pyspider.libs.base_handler import *
  38.  
  39. class Handler(BaseHandler):
  40. crawl_config = {
  41. 'headers': {
  42. 'Cookie': 'SRCHHPGUSR=ADLT=OFF',
  43. 'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
  44. 'X-Requested-With' : 'XMLHttpRequest',
  45. 'cache-control' : 'no-cache'
  46. }
  47. }
  48.  
  49. @every(minutes=24 * 60)
  50. def on_start(self):
  51.  
  52. logger.info("test")
  53. self.crawl('http://www.bing.com/videos/search?q=test&first=36&count=35&FORM=PEVR1', callback=self.index_page)
  54. #self.crawl('https://www.google.com.au/search?q=test&ie=utf-8&oe=utf-8&client=firefox-b-ab&gfe_rd=cr&ei=pEOcV7nBN9LN8gfMpov4Ag', callback=self.index_page)
  55. return
  56.  
  57. @config(age=10 * 24 * 60 * 60)
  58. def index_page(self, response):
  59. for each in response.doc('a[href^="http"]').items():
  60. self.crawl(each.attr.href, callback=self.detail_page)
  61.  
  62. @config(priority=2)
  63. def detail_page(self, response):
  64. return {
  65. "url": response.url,
  66. "title": response.doc('title').text(),
  67. }
  68.  
  69. def on_message(self, project, msg):
  70. #print(msg)
  71. logging.info("test")
  72. #logger.info(msg)
  73. self.crawl(msg, callback=self.index_page)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement