Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
- from scrapy.http import Request
- from diploma.items import Post
class DjinniSpider(scrapy.Spider):
    """Log in to djinni.co, walk the search results and scrape each post.

    Spider arguments (``scrapy crawl djinni -a name=value``):
        language: optional value for the ``title`` search filter.
        limit: optional maximum number of posts to follow. When omitted,
            or given as the literal string ``'None'``, there is no limit.
        email, password: optional credentials overriding the defaults below.
    """

    name = 'djinni'
    start_urls = ['https://djinni.co/login']

    # NOTE(security): credentials are committed in source. They are kept as
    # defaults only so existing invocations keep working; prefer passing
    # ``-a email=... -a password=...`` and rotating this password.
    email = 'artur@chimplie.com'
    password = 'artur4ik'

    # Defaults for the optional spider arguments, so that running the spider
    # without ``-a language=...`` / ``-a limit=...`` does not raise
    # AttributeError.
    language = None
    limit = None

    def parse(self, response):
        """Submit the login form found on the start page."""
        return scrapy.FormRequest.from_response(
            response,
            formdata={'email': self.email, 'password': self.password},
            callback=self.after_login,
        )

    def after_login(self, response):
        """Request the listing page, filtered by ``language`` when given."""
        from urllib.parse import urlencode

        url = response.request.url
        if self.language:
            # Encode the value so names such as "C#" or "C++" survive as a
            # query parameter instead of becoming a fragment / spaces.
            separator = '&' if '?' in url else '?'
            url = url + separator + urlencode({'title': self.language})
        # Without a language filter this is the URL that was just fetched;
        # dont_filter keeps Scrapy's duplicate filter from dropping it.
        yield scrapy.Request(url=url, callback=self.scrap, dont_filter=True)

    def scrap_post(self, response):
        """Build a ``Post`` item from a single post page.

        Missing title or salary nodes yield empty strings instead of raising.
        Apostrophes are removed from every field, as in the original code.
        """
        title = response.css('h1::text').extract_first(default='').strip()
        # The first character of the salary text is dropped (presumably a
        # currency sign - TODO confirm against the page markup).
        salary = response.css(
            'div.main-profile-details span::text'
        ).extract_first(default='')[1:]
        profile = '\n'.join(response.css('p.profile::text').extract())
        skills = self._format_skills(response)

        yield Post(
            title=title.replace('\'', ''),
            salary=salary.replace('\'', ''),
            profile=profile.replace('\'', ''),
            skills=skills.replace('\'', ''),
        )

    def scrap(self, response):
        """Follow every post on a results page, then the next page.

        The number of posts followed so far travels between pages in
        ``response.meta['counter']`` so that ``limit`` applies to the whole
        crawl rather than to each page.
        """
        limit = self._post_limit()
        followed = response.meta.get('counter') or 0

        for href in response.css('div.searchresults h4 a::attr(href)'):
            # Check before counting, so exactly ``limit`` posts are followed
            # (checking after the increment stopped one post early).
            if limit is not None and followed >= limit:
                return
            followed += 1
            yield response.follow(href, callback=self.scrap_post)

        if limit is not None and followed >= limit:
            # Limit reached on the last post of this page: skip pagination.
            return

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(
                next_page,
                callback=self.scrap,
                meta={'counter': followed},
            )

    def _post_limit(self):
        """Return ``limit`` as an int, or ``None`` when no limit is set."""
        if self.limit is None or self.limit == 'None':
            return None
        return int(self.limit)

    @staticmethod
    def _format_skills(response):
        """Return the skills table as ``'<id> - <data-score>;'`` entries.

        Both attributes are read from the same ``div.skill`` node, so a node
        missing ``data-score`` cannot shift the pairing of later skills.
        """
        entries = []
        for node in response.css('table.skills-table tr div.skill'):
            skill_id = node.xpath('@id').extract_first()
            if skill_id is None:
                # Only nodes carrying an id were listed originally.
                continue
            score = node.xpath('@data-score').extract_first(default='')
            entries.append(f'{skill_id} - {score};')
        return ''.join(entries)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement