Advertisement
Guest User

scraper methods

a guest
Nov 13th, 2020
34
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.74 KB | None | 0 0
  1. def parse_reports(self, response):
  2.         report_dict={}
  3.         for frame in response.xpath('//div[re:test(@class,"schedule__cell schedule__cell--talk col-\d-\d-\d")]'):
  4.             try:
  5.                 report_link = ''.join(('https://2019.jokerconf.com',frame.xpath('.//a[@class="link schedule__link"]/@href').get(),'/'))
  6.                 if ('bof' in report_link) or ('party' in report_link):
  7.                     continue
  8.                 complexity_text = frame.xpath('.//div[@class="schedule__helper"]//img/@title').get()
  9.                 report_dict['complexity'] = {'value':self.COMPLEXITY_VALUES[complexity_text], 'name':complexity_text}
  10.                 material_links = [self.materials_dict_form(material_link) for material_link in frame.xpath('.//a/@href').getall()]
  11.                 print(material_links)
  12.                 report_dict['source'] = material_links
  13.                 report_dict['tags'] = [tag.strip()[1::] for tag in frame.xpath('.//i[@class="schedule__tags"]//nobr/text()').getall()]
  14.                 yield Request(
  15.                             report_link,
  16.                             callback=self.parse_authors,
  17.                             meta={'report_dict':report_dict}
  18.                             )
  19.             except TypeError as typeErr:
  20.                 print(f'exception {typeErr} raised')
  21.                 continue
  22.  
  23.     def parse_authors(self, response):
  24.         report = Reports()
  25.         report['complexity'] = response.meta['report_dict']['complexity']
  26.         report['source'] = response.meta['report_dict']['source']
  27.         report['tags'] = response.meta['report_dict']['tags']
  28.         report['title'] = response.xpath('//h1[@class="talk_title"]/text()').get()
  29.         report['description'] = response.xpath('//main[@class="talk-main"]//p/text()').get()
  30.         speakers_list=[]
  31.         for speaker_sec in response.xpath('//div[@class="talk-speaker"]'):
  32.             speaker = Speakers()
  33.             contact_info = ContactInfo()
  34.             speaker['name'] = speaker_sec.xpath('.//h5[@class="speaker-info_name"]/text()').get()
  35.             speaker['avatar'] = speaker_sec.xpath('.//img[@class="img-fluid"]/@src').get()
  36.             speaker['bio'] = speaker_sec.xpath('.//div[@class="speaker-info_bio"]//p/text()').get()
  37.             contact_info['company'] = (speaker_sec.xpath('.//h6[@class="speaker-info_company"]/text()').get(),str(*re.findall(r'\d\d\d\d',self.conference['name'])))
  38.             contact_info['twitterUsername'] =  speaker_sec.xpath('.//div[@class="speaker_profiles"]//a[@class="twitter_link"]/@href').get()
  39.             speaker['contactInfo']=contact_info
  40.             speakers_list.append(speaker)
  41.         report['speakers'] = speakers_list
  42.         self.conference['report'] = report
  43.         yield self.conference
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement