Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
- import scrapy
class BrickSetSpider(scrapy.Spider):
    """Spider that walks an osta.ee category listing and yields one dict per offer.

    Each yielded record has the keys ``ID`` (running counter), ``name``
    (the ``title`` attribute found under the offer's ``h3``), ``price``
    (whitespace-stripped text of the current-price element, or ``None`` when
    absent) and ``image`` (the ``data-original`` attribute of the offer's
    ``img``). After the offers of a page are emitted, the spider follows the
    "next page" link if one is present.
    """

    name = "osta_spider"  # Name under which Scrapy registers this spider.

    # Running index written into the 'ID' field of every yielded record.
    # The first ``self.COUNTER += 1`` creates an instance attribute that
    # shadows this class-level starting value.
    COUNTER = 0

    # Browser-like User-Agent header sent with every request issued here.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'
    }

    # Listing page the crawl starts from.
    url = "https://www.osta.ee/kategooria/arvutid/lauaarvutid"

    def start_requests(self):
        """Yield the single initial request for the category listing page."""
        # No explicit callback is given, so Scrapy routes the response to
        # ``parse`` by default.
        yield scrapy.Request(self.url, headers=self.headers)

    def parse(self, response):
        """Extract the offers from one listing page, then follow pagination.

        Args:
            response: The Scrapy response for a listing page.

        Yields:
            One ``dict`` per offer (keys ``ID``, ``name``, ``price``,
            ``image``), followed by a ``scrapy.Request`` for the next listing
            page when a next-page link exists.
        """
        MAIN_CONTAINER_SELECTOR = '.main-content__section-helper'
        THUMB_CONTENT_SELECTOR = '.offer-thumb__content'
        THUMB_IMAGE_SELECTOR = '.offer-thumb__image'
        TITLE_SELECTOR = 'h3 ::attr(title)'
        PRICE_SELECTOR = '.offer-thumb__price--current ::text'
        IMAGE_SELECTOR = 'img ::attr(data-original)'
        NEXT_PAGE_SELECTOR = 'a.icon.next.page-link ::attr(href)'

        container = response.css(MAIN_CONTAINER_SELECTOR)
        thumb_contents = container.css(THUMB_CONTENT_SELECTOR)
        thumb_images = container.css(THUMB_IMAGE_SELECTOR)

        # NOTE(review): content and image nodes are paired purely by position.
        # If an offer ever lacks one of the two nodes, every later pair on the
        # page shifts by one -- confirm against the live markup.
        for thumb_content, thumb_image in zip(thumb_contents, thumb_images):
            # Selector results expose their own ``css`` method, so child
            # elements are looked up relative to each offer node.
            name = thumb_content.css(TITLE_SELECTOR).extract_first()
            price = thumb_content.css(PRICE_SELECTOR).extract_first()
            image = thumb_image.css(IMAGE_SELECTOR).extract_first()

            yield {
                'ID': self.COUNTER,
                'name': name,
                # ``extract_first`` returns None when nothing matched; only
                # strip when there is text to strip.
                'price': price.strip() if price else price,
                'image': image,
            }
            self.COUNTER += 1

        next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
        if next_page:
            # Resolve the href against the current page URL. Prepending the
            # site root by hand produced "//" for root-relative hrefs and a
            # broken URL for absolute ones.
            next_url = response.urljoin(next_page)
            yield scrapy.Request(next_url, self.parse, headers=self.headers)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement