Advertisement
Guest User

scrapy spider

a guest
May 8th, 2016
413
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.62 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. # Usage: scrapy crawl ia_check -o IA_OUT.csv -t csv
  3.  
  4. import time
  5. import logging
  6.  
  7. import scrapy
  8. from scrapy.http import Request
  9. from scrapy.selector import Selector
  10. from scrapy.spiders import CrawlSpider, Rule
  11.  
  12. from ia_check.items import Check_Item
  13. from datetime import datetime
  14.  
  15. import ia_check
  16.  
  class CheckSpider(CrawlSpider):
      """Fetch a fixed list of Amazon product pages and record each page's title.

      For every URL in ``start_urls`` one ``Check_Item`` is produced with:

      * ``status`` -- list of text nodes matched by the title XPath (empty
        when nothing matches, e.g. on an error page),
      * ``application_url`` -- the URL of the response,
      * ``timestamp`` -- UTC time at which the response was parsed.

      NOTE(review): Scrapy's documentation advises against overriding
      ``parse`` on a ``CrawlSpider``. No ``rules`` are defined here and every
      request names ``self.parse`` explicitly, so plain ``scrapy.Spider``
      looks sufficient -- confirm before changing the base class.
      """

      name = "ia_check"
      # Deliver these statuses to the callback instead of having them
      # filtered out as HTTP errors, so missing/throttled pages still
      # yield an item.
      handle_httpstatus_list = [404, 429, 503]

      start_urls = [
          "http://www.amazon.com/Easy-Smart-Touch-Action-Games/dp/B00PRH5UJW",
          "http://www.amazon.com/mobile9-LAZYtube-MP4-Video-Downloader/dp/B00KFITEV8",
          "http://www.amazon.com/Forgress-Storyteller-Audiobook-Pro/dp/B00J0T73XO",
          "http://www.amazon.com/cgt-MP3-Downloader/dp/B00O65Z0RS",
          "http://www.amazon.com/DoomsDayBunny-Squelch-Free-Music-Downloader/dp/B00N3DDDRI",
      ]

      def timestamp(self):
          """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

          The trailing ``Z`` denotes UTC, so the value is taken from
          ``utcnow()`` rather than local time, and ``strftime`` zero-pads
          every field (``2016-05-08T03:04:05Z``, not ``2016-5-8T3:4:5Z``).
          """
          return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

      def start_requests(self):
          """Yield one request per start URL, tagged with Splash render options.

          NOTE(review): the ``splash`` meta key is presumably consumed by the
          scrapy-splash middleware configured in the project settings --
          confirm it is enabled, otherwise the key is ignored.
          """
          splash_options = {
              'endpoint': 'render.html',
              'args': {'wait': 1},
          }
          for url in self.start_urls:
              # Fresh dicts per request so no request shares mutable meta.
              yield scrapy.Request(url, self.parse, meta={
                  'splash': {
                      'endpoint': splash_options['endpoint'],
                      'args': dict(splash_options['args']),
                  }
              })

      def parse(self, response):
          """Build a ``Check_Item`` describing the fetched page.

          Args:
              response: the response for one of the start URLs; may carry a
                  404/429/503 status because of ``handle_httpstatus_list``.

          Returns:
              A ``Check_Item`` with ``status``, ``application_url`` and
              ``timestamp`` populated.
          """
          item = Check_Item()
          # Text of any element with class "h1", or of the #btAsinTitle element.
          item['status'] = response.xpath(
              ".//*[@class='h1']/text()|.//*[@id='btAsinTitle']/text()"
          ).extract()
          item['application_url'] = response.url
          item['timestamp'] = self.timestamp()
          return item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement