Guest User

Untitled

a guest
Jan 21st, 2018
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.25 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. from tutor_job_spy.items import TutorJobSpyItem
  4.  
  5. class Spyspider(scrapy.Spider):
  6. name = 'spy'
  7. #for privacy reasons I delete the url information :)
  8. allowed_domains = ['']
  9. url_0 = ''
  10.  
  11. start_urls = [url_0, ]
  12. base_url = ''
  13. list_previous = []
  14. list_present = []
  15.  
  16. def parse(self, response):
  17. numbers = response.xpath( '//tr[@bgcolor="#d7ecff" or @bgcolor="#eef7ff"]/td[@width="8%" and @height="40"]/span/text()').extract()
  18. self.list_previous = numbers
  19. self.list_present = numbers
  20.  
  21. yield scrapy.Request(self.url_0, self.keep_spying)
  22.  
  23. def keep_spying(self, response):
  24. numbers = response.xpath('//tr[@bgcolor="#d7ecff" or @bgcolor="#eef7ff"]/td[@width="8%" and @height="40"]/span/text()').extract()
  25.  
  26. self.list_previous = self.list_present
  27. self.list_present = numbers
  28.  
  29. # judge if anything new
  30. if (self.list_present != self.list_previous):
  31. self.goto_new_demand(response)
  32. #time.sleep(60) #from cache
  33. yield scrapy.Request(self.url_0, self.keep_spying, dont_filter=True)
  34.  
  35. def goto_new_demand(self, response):
  36. detail_links = response.xpath('//div[@class="ShowDetail"]/a/@href').extract()
  37. yield scrapy.Request(self.base_url + detail_links[0], self.get_new_demand)
  38.  
  39.  
  40. def get_new_demand(self, response):
  41. new_demand = TutorJobSpyItem()
  42. new_demand['url'] = response.url
  43. requirments = response.xpath('//tr[@#bgcolor="#eef7ff"]/td[@colspan="2"]/div/text()').extract()[0]
  44. new_demand['gender'] = self.get_gender(requirments)
  45. new_demand['region'] = response.xpath('//tr[@bgcolor="#d7ecff"]/td[@align="left"]/text()').extract()[5]
  46. new_demand['grade'] = response.xpath('//tr[@bgcolor="#d7ecff"]/td[@align="left"]/text()').extract()[7]
  47. new_demand['subject'] = response.xpath('//tr[@bgcolor="#eef7ff"]/td[@align="left"]/text()').extract()[2]
  48. return new_demand
  49.  
  50. def get_gender(self, requirments):
  51. if ('女老师' in requirments):
  52. return 'F'
  53. elif ('男老师' in requirments):
  54. return 'M'
  55. else:
  56. return 'Both okay'
  57.  
  58. if (self.list_present != self.list_previous):
  59. self.goto_new_demand(response)
Add Comment
Please, Sign In to add comment