Advertisement
Guest User

Untitled

a guest
Jun 20th, 2018
199
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.89 KB | None | 0 0
  1. import scrapy
  2. from scrapy.http import Request
  3. from diploma.items import Post
  4.  
  5.  
  6. class DjinniSpider(scrapy.Spider):
  7. name = 'djinni'
  8. start_urls = ['https://djinni.co/login']
  9.  
  10. def parse(self, response):
  11. return scrapy.FormRequest.from_response(
  12. response,
  13. formdata={'email': 'artur@chimplie.com', 'password': 'artur4ik'},
  14. callback=self.after_login
  15. )
  16.  
  17. def after_login(self, response):
  18. # Go to page with selected filters
  19. language = self.language
  20. url = response.request.url
  21. if language:
  22. url = url + f'?title={language}'
  23.  
  24. yield scrapy.Request(url=url, callback=self.scrap)
  25.  
  26. def scrap_post(self, response):
  27. title = response.css('h1::text').extract_first().strip()
  28. salary = response.css('div.main-profile-details span::text').extract_first()[1:]
  29. profile = '\n'.join(response.css('p.profile::text').extract())
  30.  
  31. names = response.css('table.skills-table tr div.skill::attr(id)').extract()
  32. values = response.css('table.skills-table tr div.skill::attr(data-score)').extract()
  33. skills = ''
  34. for i, name in enumerate(names):
  35. skills += f'{name} - {values[i]};'
  36.  
  37. post = Post(
  38. title=title.replace('\'', ''),
  39. salary=salary.replace('\'', ''),
  40. profile=profile.replace('\'', ''),
  41. skills=skills.replace('\'', ''),
  42. )
  43.  
  44. yield post
  45.  
  46. def scrap(self, response):
  47. if not self.limit == 'None':
  48. counter = response.meta.get('counter') or 0
  49. if counter >= int(self.limit):
  50. return
  51.  
  52. posts = response.css('div.searchresults h4 a::attr(href)')
  53. for post in posts:
  54. if not self.limit == 'None':
  55. counter += 1
  56. if counter >= int(self.limit):
  57. return
  58.  
  59. yield response.follow(post, callback=self.scrap_post)
  60.  
  61. next_page = response.css('li.next a::attr(href)').extract_first()
  62. if next_page is not None:
  63. if not self.limit == 'None':
  64. yield response.follow(next_page, callback=self.scrap, meta={'counter': counter})
  65. else:
  66. yield response.follow(next_page, callback=self.scrap)
  67.  
  68. # Get titles from each ad
  69. # for post in posts:
  70. # header = post.css('h4 a.profile::text').extract_first()
  71. # postItem = Post(title=header.strip())
  72. # print(header.strip())
  73. # yield {
  74. # 'title': header.strip()
  75. # }
  76. # counter += 1
  77.  
  78. # # move to the next page and repeat
  79. # next_page = response.css('li.next a::attr(href)').extract_first()
  80. # if next_page is not None:
  81. # yield response.follow(next_page, callback=self.scrap, meta={'counter': counter})
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement