Advertisement
UniQuet0p1

Untitled

May 3rd, 2021
148
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.46 KB | None | 0 0
  1. import json
  2.  
  3. import scrapy
  4.  
  5.  
  6. class BrickSetSpider(scrapy.Spider):
  7.     name = "osta_spider"  # just a name for the spider.
  8.     COUNTER = 0
  9.     headers = {
  10.         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'
  11.     }
  12.     url = "https://www.osta.ee/kategooria/arvutid/lauaarvutid"
  13.  
  14.     def start_requests(self):
  15.         # a list of URLs that you start to crawl from. We'll start with one URL.
  16.  
  17.         yield scrapy.http.Request(self.url, headers=self.headers)
  18.  
  19.     def parse(self, response):
  20.         """
  21.        Weโ€™ll use CSS selectors for now since CSS is the easier option and a
  22.        perfect fit for finding all the sets on the page. If you look at the
  23.        HTML for the page, you'll see that each set is specified with the
  24.        class set. Since we're looking for a class, we'd use .set for our
  25.        CSS selector. All we have to do is pass that selector into the
  26.        response object
  27.        """
  28.         MAIN_CONTAINER_SELECTOR = '.main-content__section-helper'
  29.         container = response.css(MAIN_CONTAINER_SELECTOR)
  30.         THUMB_CONTENT_SELECTOR = '.offer-thumb__content'
  31.         THUMB_IMAGE_SELECTOR = '.offer-thumb__image'
  32.  
  33.  
  34.         for thumb_content, thumb_image in zip(container.css(THUMB_CONTENT_SELECTOR),
  35.                                               container.css(THUMB_IMAGE_SELECTOR)):
  36.             """The brickset object weโ€™re looping over has its own css method,
  37.            so we can pass in a selector to locate child elements.
  38.            """
  39.             TITLE_SELECTOR = 'h3 ::attr(title)'
  40.             PRICE_SELECTOR = '.offer-thumb__price--current ::text'
  41.             IMAGE_SELECTOR = 'img ::attr(data-original)'
  42.  
  43.             name = thumb_content.css(TITLE_SELECTOR).extract_first()
  44.             price = thumb_content.css(PRICE_SELECTOR).extract_first()
  45.             image = thumb_image.css(IMAGE_SELECTOR).extract_first()
  46.  
  47.             data = {
  48.                 'ID': self.COUNTER,
  49.                 'name': name,
  50.                 'price': price.strip() if price else price,
  51.                 'image': image
  52.             }
  53.             yield data
  54.  
  55.             self.COUNTER += 1
  56.  
  57.         NEXT_PAGE_SELECTOR = 'a.icon.next.page-link ::attr(href)'
  58.         next_page =response.css(NEXT_PAGE_SELECTOR).extract_first()
  59.  
  60.         if next_page:
  61.             url = response.urljoin("https://www.osta.ee/" + next_page)
  62.             yield scrapy.Request(url, self.parse, headers=self.headers)
  63.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement