Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
- from scrapy.exceptions import CloseSpider
class AmazonBooksSpider(scrapy.Spider):
    """Scrape Amazon book search listings and their product detail pages.

    ``parse`` handles a search-result page: it reads the summary fields of
    every listed book, follows each book's detail link, and then schedules
    the next result page.  ``parse_item`` handles a detail page and emits
    the final item, merging the summary fields passed in via ``cb_kwargs``
    with the author and price fields found on the detail page.
    """

    name = 'amazon_books'

    # Kept as a plain attribute for backward compatibility, but Scrapy only
    # honours per-spider settings that appear in ``custom_settings``; a bare
    # class attribute is silently ignored.  Note that the setting counts
    # every crawled response (listing pages and detail pages alike).
    CLOSESPIDER_PAGECOUNT = 5
    custom_settings = {'CLOSESPIDER_PAGECOUNT': CLOSESPIDER_PAGECOUNT}

    _START_URL = (
        'https://www.amazon.com/s?bbn=1&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1'
        '%2Cp_n_publication_date%3A1250226011&dc&fst=as%3Aoff&qid=1606224210'
        '&rnid=1250225011&ref=lp_1_nr_p_n_publication_date_0'
    )
    # NOTE(review): this query differs from _START_URL (different ``rh``
    # filter); kept exactly as the original author wrote it -- confirm.
    _NEXT_PAGE_URL_PREFIX = (
        'https://www.amazon.com/s?i=stripbooks&bbn=1&rh=n%3A1'
        '%2Cp_n_feature_nine_browse-bin%3A3291437011&dc&fs=true&page='
    )
    _NEXT_PAGE_URL_SUFFIX = '&qid=1623501355&rnid=3291435011&ref=sr_pg_1'

    _BOOK_XPATH = (
        '//*[@class="s-include-content-margin s-border-bottom '
        's-latency-cf-section"]'
    )
    _TITLE_H2_XPATH = (
        './/h2[@class="a-size-mini a-spacing-none a-color-base '
        's-line-clamp-2"]'
    )

    def start_requests(self):
        """Yield the request for the first search-result page."""
        yield scrapy.Request(
            url=self._START_URL,
            callback=self.parse,
            meta={'current_page': 1},
        )

    def parse(self, response):
        """Handle one search-result page.

        Yields one detail-page request per listed book (carrying the summary
        fields in ``cb_kwargs``), followed by a request for the next result
        page.  Nothing is yielded when the page lists no books, which ends
        the pagination chain.
        """
        books = response.xpath(self._BOOK_XPATH)
        if not books:
            # No result cards on this page: stop paginating instead of
            # requesting ever-higher page numbers.
            return

        for book in books:
            # Resolve the link relative to *this* book; ``.get()`` returns a
            # single string (or None), so it must not be iterated.
            detail_href = book.xpath(self._TITLE_H2_XPATH + '/a/@href').get()
            if detail_href is None:
                continue

            yield scrapy.Request(
                url=response.urljoin(detail_href),
                callback=self.parse_item,
                cb_kwargs={
                    'name': book.xpath(
                        self._TITLE_H2_XPATH + '/a/span/text()'
                    ).get(),
                    'date': book.xpath(
                        './/*[@class="a-size-base a-color-secondary '
                        'a-text-normal"]/text()'
                    ).get(),
                    'rating': book.xpath(
                        './/*[@class="a-icon-alt"]/text()'
                    ).get(),
                    'reviewers': book.xpath(
                        './/a[@class="a-link-normal"]/span/text()'
                    ).get(),
                    'image_url': book.xpath(
                        './/*[@class="a-section aok-relative '
                        's-image-fixed-height"]/img/@src'
                    ).get(),
                },
            )

        # Pagination lives here (not in parse_item) because only listing
        # requests carry ``current_page`` in their meta.
        next_page = response.meta.get('current_page', 1) + 1
        yield scrapy.Request(
            url=(
                self._NEXT_PAGE_URL_PREFIX
                + str(next_page)
                + self._NEXT_PAGE_URL_SUFFIX
            ),
            callback=self.parse,
            meta={'current_page': next_page},
        )

    def parse_item(self, response, name, date, rating, reviewers, image_url):
        """Handle one book detail page and yield the finished item.

        ``name``, ``date``, ``rating``, ``reviewers`` and ``image_url`` are
        the summary fields collected on the listing page; they are copied
        into the item unchanged.  Every scraped field is a string, or None
        when the corresponding node is missing from the page.
        """
        # The price slots are addressed by position among the "slot-price"
        # spans (1, 4 and 5); which format each position holds depends on
        # the page layout -- TODO confirm against a live page.
        slot_price = '(//*[@class="slot-price"]/span)[{}]/text()'

        yield {
            'book_name': name,
            'book_author': response.xpath(
                '//*[@class="author notFaded"]/span/a/text()'
            ).get(),
            'book_published_date': date,
            'book_hardcover_sell_price': response.xpath(
                slot_price.format(4)
            ).get(),
            'book_kindle_sell_price': response.xpath(
                slot_price.format(1)
            ).get(),
            'book_audio_price': response.xpath(slot_price.format(5)).get(),
            'book_list_price': response.xpath(
                '//*[@id="listPrice"]/text()'
            ).get(),
            'book_saving_amount': response.xpath(
                '//*[@id="savingsAmount"]/text()'
            ).get(),
            'book_rating': rating,
            'book_reviewers': reviewers,
            'book_url': response.url,
            'book_image_url': image_url,
        }
Advertisement
Add Comment
Please, Sign In to add comment