Advertisement
Guest User

Untitled

a guest
Jun 20th, 2014
254
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.80 KB | None | 0 0
  1. class MySpider(CrawlSpider):
  2.     name = "scraper"
  3.     allowed_domains = ["amazon.com"]
  4.     start_urls = ["http://www.amazon.com/s/ref=sr_pg_3?rh=n%3A133140011%2Cn%3A%21133141011%2Cn%3A154606011%2Cn%3A668010011%2Cn%3A158591011%2Cn%3A158592011&page=3&bbn=158591011&ie=UTF8&qid=1403264902"]
  5.  
  6.     rules = [Rule(SgmlLinkExtractor(allow=('\/dp\/B00.*digital-text')),callback='parse_items')]
  7.  
  8.     def parse_items(self, response):
  9.  
  10.         sel=Selector(response)
  11.         items = []
  12.         url=response.url
  13.         item = AmazonItem()
  14.         print 'inside'
  15.         print sel.css('#btAsinTitle::text').extract()
  16.         item ["title"] = ''.join(sel.css('#btAsinTitle::text').extract())
  17.         print '-----',item["title"]
  18.         print response.url
  19.         item["digitalprice"] = ''.join(sel.css('.digitalListPrice>.listprice::text').extract())
  20.         item["digitalprice"]=re.sub('\s+','',item["digitalprice"])
  21.         item["listprice"] = ''.join(sel.css('.listPrice::text').extract())
  22.         item["listprice"]=re.sub('\s+','',item["listprice"])
  23.         item["kindleprice"] = ''.join(sel.css('.priceLarge::text').extract())
  24.         item["kindleprice"]=re.sub('\s+','',item["kindleprice"])
  25.         item["author"]=''.join(sel.css('span.contributorNameTrigger a::text').extract())
  26.         item["author"]=re.sub('\s+','',item["author"])
  27.         item["reviews"]=''.join(sel.css('#divsinglecolumnminwidth > div:nth-child(34) > span > span > a::text').extract())
  28.         item["reviews"]=re.sub('\s+','',item["reviews"])
  29.         item["reviews"] = ''.join([i for i in item["reviews"] if i.isdigit()])
  30.  
  31.  
  32.         #if item["digitalprice"] != None and item["listprice"] != None and item["kindleprice"] != None and item["author"] != None and item["reviews"] !=None:
  33.         items.append(item)
  34.         print items
  35.         return items
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement