Veeru15

Untitled

Oct 17th, 2017
175
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.98 KB | None | 0 0
  1. import scrapy
  2.  
  3. class TestSpider(scrapy.Spider):
  4. name = "testdoc1"
  5. allowed_domains = ['amazon.in']
  6. start_urls = ["https://www.amazon.in/s/ref=amb_link_46?ie=UTF8&bbn=1389432031&rh=i%3Aelectronics%2Cn%3A976419031%2Cn%3A%21976420031%2Cn%3A1389401031%2Cn%3A1389432031%2Cp_89%3AApple&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_s=merchandised-search-leftnav&pf_rd_r=CYS25V3W021MSYPQ32FB&pf_rd_r=CYS25V3W021MSYPQ32FB&pf_rd_t=101&pf_rd_p=1ce3e975-c6e8-479a-8485-2e490b9f58a9&pf_rd_p=1ce3e975-c6e8-479a-8485-2e490b9f58a9&pf_rd_i=1389401031"]
  7.  
  8. def parse(self, response):
  9. for post_link in response.xpath('//a/@href').extract():
  10. link = response.urljoin(post_link)
  11. yield scrapy.Request(link, callback=self.parse_post)
  12.  
  13. # Checks if the main page has a link to next page if True keep parsing.
  14. next_page = response.xpath('(//a[@class="pagnNext"])[1]/@href').extract_first()
  15. if next_page:
  16. yield scrapy.Request(next_page, callback=self.parse)
  17.  
  18. def parse_post(self, response):
  19. # Scrape title, content from post.
  20. for post in response.xpath('//li[contains(@class,"s-result-item celwidget")]'):
  21. item = dict()
  22. item['Name'] = post.xpath('.//h2[contains(@class,"a-size-base s-inline s-access-title a-text-normal")]/text()').extract()
  23. item['Price'] = post.xpath('.//span[contains(@class,"a-size-base a-color-price s-price a-text-bold")]/text()').extract()
  24. item['Image'] = post.xpath('.//img[contains(@class,"s-access-image cfMarker")]/@src').extract()
  25. item['Link'] = post.xpath('.//a[contains(@class,"a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal")]/@href').extract()
  26. yield item
  27.  
  28. # If the post page has a link to next page keep parsing.
  29. next_page = response.xpath('(//a[@class="pagnNext"])[1]/@href').extract_first()
  30. if next_page:
  31. yield scrapy.Request(next_page, callback=self.parse_post)
Add Comment
Please, Sign In to add comment