Fazlul

Untitled

Jun 13th, 2021
96
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.68 KB | None | 0 0
  1. import scrapy
  2. from scrapy.exceptions import CloseSpider
  3.  
  class AmazonBooksSpider(scrapy.Spider):
      """Crawl Amazon book listing pages and emit one item per book.

      ``parse`` handles a listing page: it reads the summary fields of every
      book tile, schedules a request for that book's detail page, and then
      schedules the next listing page.  ``parse_item`` handles a detail page
      and yields the final item, merging the listing-page fields it received
      through ``cb_kwargs`` with the fields scraped from the detail page.
      """

      name = 'amazon_books'

      # Kept as a class attribute for backward compatibility.  A bare class
      # attribute is not read by Scrapy as a setting, so it is also exposed
      # through ``custom_settings``.  Per the Scrapy docs this limit counts
      # crawled responses (listing and detail pages alike).
      CLOSESPIDER_PAGECOUNT = 5
      custom_settings = {'CLOSESPIDER_PAGECOUNT': CLOSESPIDER_PAGECOUNT}

      _START_URL = (
          'https://www.amazon.com/s?bbn=1&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1'
          '%2Cp_n_publication_date%3A1250226011&dc&fst=as%3Aoff&qid=1606224210'
          '&rnid=1250225011&ref=lp_1_nr_p_n_publication_date_0'
      )
      # NOTE(review): this listing uses a different filter than _START_URL
      # (feature_nine_browse-bin vs publication_date) -- confirm that pages
      # 2+ are meant to come from this listing.
      _NEXT_PAGE_URL = (
          'https://www.amazon.com/s?i=stripbooks&bbn=1&rh=n%3A1'
          '%2Cp_n_feature_nine_browse-bin%3A3291437011&dc&fs=true&page={page}'
          '&qid=1623501355&rnid=3291435011&ref=sr_pg_1'
      )

      _BOOK_TILE_XPATH = (
          '//*[@class="s-include-content-margin s-border-bottom '
          's-latency-cf-section"]'
      )
      _TITLE_H2_XPATH = (
          './/h2[@class="a-size-mini a-spacing-none a-color-base '
          's-line-clamp-2"]'
      )

      def start_requests(self):
          """Yield the request for the first listing page (page 1)."""
          yield scrapy.Request(
              url=self._START_URL,
              callback=self.parse,
              meta={'current_page': 1},
          )

      def parse(self, response):
          """Parse a listing page.

          Yields one detail-page request per book tile, carrying the fields
          scraped from the tile in ``cb_kwargs``, followed by a request for
          the next listing page.  A page without book tiles ends pagination.
          """
          books = response.xpath(self._BOOK_TILE_XPATH)
          if not books:
              # Nothing on this page: stop following further listing pages.
              return

          for book in books:
              # Query relative to the tile (not the whole response) so each
              # book gets its own link rather than the first link on the page.
              url = book.xpath(self._TITLE_H2_XPATH + '/a/@href').get()
              if url is None:
                  # No detail link on this tile; nothing to follow.
                  continue

              yield scrapy.Request(
                  url=response.urljoin(url),
                  callback=self.parse_item,
                  cb_kwargs={
                      'name': book.xpath(
                          self._TITLE_H2_XPATH + '/a/span/text()'
                      ).get(),
                      'date': book.xpath(
                          './/*[@class="a-size-base a-color-secondary '
                          'a-text-normal"]/text()'
                      ).get(),
                      'rating': book.xpath(
                          './/*[@class="a-icon-alt"]/text()'
                      ).get(),
                      'reviewers': book.xpath(
                          './/a[@class="a-link-normal"]/span/text()'
                      ).get(),
                      'image_url': book.xpath(
                          './/*[@class="a-section aok-relative '
                          's-image-fixed-height"]/img/@src'
                      ).get(),
                  },
              )

          # Pagination belongs to the listing callback: only listing requests
          # carry ``current_page`` in their meta.
          next_page = response.meta.get('current_page', 1) + 1
          yield scrapy.Request(
              url=self._NEXT_PAGE_URL.format(page=next_page),
              callback=self.parse,
              meta={'current_page': next_page},
          )

      def parse_item(self, response, name, date, rating, reviewers, image_url):
          """Parse a book detail page and yield the complete item.

          ``name``, ``date``, ``rating``, ``reviewers`` and ``image_url`` are
          the values scraped from the listing tile and passed in through
          ``cb_kwargs``.  Every scraped field is a string, or ``None`` when
          its XPath matches nothing.
          """
          slot_price = '(//*[@class="slot-price"]/span)[{}]/text()'
          yield {
              'book_name': name,
              'book_author': response.xpath(
                  '//*[@class="author notFaded"]/span/a/text()'
              ).get(),
              'book_published_date': date,
              'book_hardcover_sell_price': response.xpath(
                  slot_price.format(4)
              ).get(),
              'book_kindle_sell_price': response.xpath(
                  slot_price.format(1)
              ).get(),
              'book_audio_price': response.xpath(slot_price.format(5)).get(),
              'book_list_price': response.xpath(
                  '//*[@id="listPrice"]/text()'
              ).get(),
              'book_saving_amount': response.xpath(
                  '//*[@id="savingsAmount"]/text()'
              ).get(),
              'book_rating': rating,
              'book_reviewers': reviewers,
              'book_url': response.url,
              'book_image_url': image_url,
          }
  68.  
  69.  
Advertisement
Add Comment
Please, Sign In to add comment