Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import scrapy
- from tutor_job_spy.items import TutorJobSpyItem
class Spyspider(scrapy.Spider):
    """Poll a listing page and scrape the newest entry whenever it changes.

    Flow:
      1. ``parse`` records the initial snapshot of the listing.
      2. ``keep_spying`` re-requests the listing in a loop, compares the new
         snapshot with the previous one and, on a difference, follows the
         first detail link via ``goto_new_demand``.
      3. ``get_new_demand`` turns the detail page into a ``TutorJobSpyItem``.
    """

    name = 'spy'
    # for privacy reasons I delete the url information :)
    allowed_domains = ['']
    url_0 = ''
    start_urls = [url_0, ]
    base_url = ''
    # Snapshots of the listing. They are only ever rebound on the instance
    # (never mutated in place), so the shared class-level defaults are safe.
    list_previous = []
    list_present = []

    # Selects the text of the per-row cell used to detect changes in the
    # listing (the original code calls these values "numbers").
    _NUMBERS_XPATH = '//tr[@bgcolor="#d7ecff" or @bgcolor="#eef7ff"]/td[@width="8%" and @height="40"]/span/text()'

    def _extract_numbers(self, response):
        """Return the list of strings matched by ``_NUMBERS_XPATH``."""
        return response.xpath(self._NUMBERS_XPATH).extract()

    def parse(self, response):
        """Record the initial listing snapshot and start the polling loop."""
        numbers = self._extract_numbers(response)
        self.list_previous = numbers
        self.list_present = numbers
        # The same URL is requested over and over, so every poll explicitly
        # bypasses the duplicate filter.
        yield scrapy.Request(self.url_0, self.keep_spying, dont_filter=True)

    def keep_spying(self, response):
        """Compare the fresh snapshot with the last one, then poll again.

        Yields the detail-page request produced by ``goto_new_demand`` when
        the listing changed, followed by the next polling request.
        """
        self.list_previous = self.list_present
        self.list_present = self._extract_numbers(response)
        # judge if anything new
        if self.list_present != self.list_previous:
            # goto_new_demand is a generator: its requests must be re-yielded,
            # otherwise it is never executed and nothing gets scheduled.
            for request in self.goto_new_demand(response):
                yield request
        # NOTE(review): there is no pause between polls (the original had a
        # commented-out time.sleep(60)). Consider throttling through Scrapy's
        # DOWNLOAD_DELAY setting rather than sleeping inside a callback.
        yield scrapy.Request(self.url_0, self.keep_spying, dont_filter=True)

    def goto_new_demand(self, response):
        """Yield a request for the first detail link found on the listing.

        Yields nothing (and logs a warning) when the page has no detail link.
        """
        detail_links = response.xpath('//div[@class="ShowDetail"]/a/@href').extract()
        if not detail_links:
            self.logger.warning('No detail link found on %s', response.url)
            return
        yield scrapy.Request(self.base_url + detail_links[0], self.get_new_demand)

    def get_new_demand(self, response):
        """Build a ``TutorJobSpyItem`` from a detail page.

        Raises:
            IndexError: if the page does not contain the expected cells; Scrapy
                logs the failure for that response.
        """
        new_demand = TutorJobSpyItem()
        new_demand['url'] = response.url
        # The attribute test is @bgcolor, matching the other row selectors
        # (the original "@#bgcolor" is not a valid XPath expression).
        requirments = response.xpath('//tr[@bgcolor="#eef7ff"]/td[@colspan="2"]/div/text()').extract()[0]
        new_demand['gender'] = self.get_gender(requirments)
        # Region and grade come from the same set of cells: query it once.
        dark_row_cells = response.xpath('//tr[@bgcolor="#d7ecff"]/td[@align="left"]/text()').extract()
        new_demand['region'] = dark_row_cells[5]
        new_demand['grade'] = dark_row_cells[7]
        new_demand['subject'] = response.xpath('//tr[@bgcolor="#eef7ff"]/td[@align="left"]/text()').extract()[2]
        return new_demand

    def get_gender(self, requirments):
        """Map the requirement text to a gender code.

        Returns 'F' if the text contains '女老师' (female teacher), 'M' if it
        contains '男老师' (male teacher), and 'Both okay' otherwise. The
        female marker is checked first, so it wins when both are present.
        """
        if '女老师' in requirments:
            return 'F'
        if '男老师' in requirments:
            return 'M'
        return 'Both okay'
Add Comment
Please, Sign In to add comment