Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class MySpider(CrawlSpider):
    """Spider that scrapes title, prices, author and review count from Amazon product pages.

    Crawling starts from the single hard-coded amazon.com listing URL in
    ``start_urls``. Links whose URL matches the ``rules`` pattern are
    fetched and handed to :meth:`parse_items`.
    """

    name = "scraper"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "http://www.amazon.com/s/ref=sr_pg_3?rh=n%3A133140011%2Cn%3A%21133141011%2Cn%3A154606011%2Cn%3A668010011%2Cn%3A158591011%2Cn%3A158592011&page=3&bbn=158591011&ie=UTF8&qid=1403264902",
    ]
    # Raw string: "/" needs no escaping in a regex, so this matches the same
    # URLs as the previous '\/dp\/B00.*digital-text' pattern.
    rules = [
        Rule(
            SgmlLinkExtractor(allow=(r'/dp/B00.*digital-text')),
            callback='parse_items',
        ),
    ]

    # Compiled once at class creation instead of re-looked-up per field.
    _WHITESPACE_RE = re.compile(r'\s+')

    # (item field, CSS query) pairs whose extracted text has ALL whitespace
    # removed before being stored. Order matches the original assignments.
    # NOTE(review): stripping every whitespace character also glues the words
    # of an author name together ("John Smith" -> "JohnSmith"); kept as-is
    # because downstream consumers may rely on it -- confirm whether intended.
    _COMPACT_FIELD_SELECTORS = (
        ("digitalprice", '.digitalListPrice>.listprice::text'),
        ("listprice", '.listPrice::text'),
        ("kindleprice", '.priceLarge::text'),
        ("author", 'span.contributorNameTrigger a::text'),
    )

    # NOTE(review): this positional selector (nth-child(34)) depends on the
    # exact page layout and presumably breaks when the layout shifts -- verify.
    _REVIEWS_SELECTOR = (
        '#divsinglecolumnminwidth > div:nth-child(34) > span > span > a::text'
    )

    @staticmethod
    def _joined_text(sel, css_query):
        """Return every text node matched by *css_query*, concatenated.

        Yields an empty string when nothing matches.
        """
        return ''.join(sel.css(css_query).extract())

    def parse_items(self, response):
        """Build one ``AmazonItem`` from a product page response.

        Args:
            response: the downloaded page for a URL matched by ``rules``.

        Returns:
            A one-element list containing the populated item. Fields whose
            selector matches nothing are stored as empty strings.
        """
        sel = Selector(response)
        item = AmazonItem()

        # The title is stored exactly as extracted (no whitespace stripping).
        item["title"] = self._joined_text(sel, '#btAsinTitle::text')

        for field, css_query in self._COMPACT_FIELD_SELECTORS:
            raw_text = self._joined_text(sel, css_query)
            item[field] = self._WHITESPACE_RE.sub('', raw_text)

        # Keep only the digits of the reviews link text; a separate
        # whitespace-stripping pass is unnecessary since whitespace is
        # never a digit.
        reviews_text = self._joined_text(sel, self._REVIEWS_SELECTOR)
        item["reviews"] = ''.join(ch for ch in reviews_text if ch.isdigit())

        # Replaces the ad-hoc debug ``print`` statements with the spider's
        # own logger so output honours the configured log level.
        self.log('scraped %s: %r' % (response.url, item))

        return [item]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement