Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- # -*- coding: utf-8 -*-
- #scrapy crawl ia_checkr -o IA_OUT.csv -t csv
- import time
- import logging
- import scrapy
- from scrapy.http import Request
- from scrapy.selector import Selector
- from scrapy.spiders import CrawlSpider, Rule
- from ia_check.items import Check_Item
- from datetime import datetime
- import ia_check
class CheckSpider(CrawlSpider):
    """Spider that records the availability/status of Amazon app pages.

    Each start URL is fetched through a Splash rendering endpoint
    (``render.html`` with a 1-second wait) so that JavaScript-built
    titles are present, then the page title text, the URL, and a UTC
    timestamp are emitted as one ``Check_Item`` per page.
    """

    name = "ia_check"
    # Let these error statuses reach parse() instead of being dropped by
    # Scrapy's default middleware, so dead/throttled pages are still logged.
    handle_httpstatus_list = [404, 429, 503]
    start_urls = [
        "http://www.amazon.com/Easy-Smart-Touch-Action-Games/dp/B00PRH5UJW",
        "http://www.amazon.com/mobile9-LAZYtube-MP4-Video-Downloader/dp/B00KFITEV8",
        "http://www.amazon.com/Forgress-Storyteller-Audiobook-Pro/dp/B00J0T73XO",
        "http://www.amazon.com/cgt-MP3-Downloader/dp/B00O65Z0RS",
        "http://www.amazon.com/DoomsDayBunny-Squelch-Free-Music-Downloader/dp/B00N3DDDRI",
    ]

    def timestamp(self):
        """Return the current UTC time as a zero-padded ISO-8601 string.

        Fixes two defects in the original implementation: ``%s``
        formatting produced unpadded components (e.g. ``2015-3-7T9:4:2Z``),
        and ``datetime.now()`` returned *local* time even though the
        trailing ``Z`` designator claims UTC.
        """
        return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

    def start_requests(self):
        """Yield one Splash-rendered request per start URL."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse, meta={
                'splash': {
                    'endpoint': 'render.html',
                    # Give client-side scripts a moment to populate the title.
                    'args': {'wait': 1},
                }
            })

    def parse(self, response):
        """Extract the page title, URL and timestamp into a Check_Item.

        The XPath matches either the new (``class='h1'``) or the legacy
        (``id='btAsinTitle'``) Amazon title element; ``status`` is a list
        (possibly empty when the page is a 404/captcha).
        """
        result = Check_Item()
        select = Selector(response).xpath
        result['status'] = select(
            ".//*[@class='h1']/text()|.//*[@id='btAsinTitle']/text()"
        ).extract()
        result['application_url'] = response.url
        result['timestamp'] = self.timestamp()
        return result
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement