Advertisement
Guest User

Untitled

a guest
Oct 21st, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.45 KB | None | 0 0
  1. import scrapy
  2.  
  3. class BlogSpider(scrapy.Spider):
  4.     name = 'blogspider'
  5.     start_urls = ['http://www2.congreso.gob.pe/Sicr/TraDocEstProc/CLProLey2016.nsf/Local%20Por%20Numero%20Inverso?OpenView']
  6.  
  7.     def parse(self, response):
  8. #        for title in response.('.post-header>h2'):
  9. #            yield {'title': title.css('a ::text').extract_first()}
  10.         links = response.xpath('body/form/table[2]/tr/td[1]/font/a/@href')
  11.         #yield response.follow(links[1], self.parseLaw)
  12.         for next_page in links:
  13.             yield response.follow(next_page, self.parseLaw)
  14.     def getMail(self, s):
  15.         return s.split(':')[1]
  16.     def parseLaw(self, response):
  17.         fecha = response.xpath('body/form/table/tr[1]/td/table/tr[3]/td[2]/font[1]/text()').extract()
  18.         codigo = response.xpath('body/form/table/tr[1]/td/table/tr[4]/td[2]/font[1]/text()').extract()
  19.         titulo = response.xpath('body/form/table/tr[1]/td/table/tr[7]/td[2]/font/text()').extract()
  20.         sumilla = response.xpath('body/form/table/tr[1]/td/table/tr[8]/td[2]/font/text()').extract()
  21.         correos = list(map(self.getMail,response.xpath('body/form/table/tr[1]/td/table/tr[9]/td[2]/font/a/@href').extract()))
  22.         autores = response.xpath('body/form/table/tr[1]/td/table/tr[9]/td[2]/font/a/text()').extract()
  23.         print (fecha)
  24.         print (codigo)
  25.         print (titulo)
  26.         print (sumilla)
  27.         print (correos)
  28.         print (autores)
  29.         yield None
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement