from scrapy import Spider
import scrapy
from scrapy.selector import Selector

from stack.items import StackItem


class StackSpider(Spider):
    name = "stack"
    # Must match the site being crawled, otherwise the offsite middleware
    # drops the follow-up requests made in parse().
    allowed_domains = ["aipass.org"]
    start_urls = [
        "http://www.aipass.org/documenti%20ufficiali",
    ]

    def parse(self, response):
        # Each <li> under the node-329 block links to one document.
        documents = Selector(response).xpath('//*[@id="node-329"]/div[1]/ul/li')

        for document in documents:
            item = StackItem()
            item['title'] = document.xpath('./a/text()').extract()
            item['url'] = response.urljoin(document.xpath('./a/@href').extract_first())
            if "node" in item['url']:
                # Internal "node" pages carry extra text: follow the link and
                # pass the partially filled item along in the request meta.
                request = scrapy.Request(item['url'], callback=self.anchor_page)
                request.meta['item'] = item
                yield request

            # Emit the item right away as well; for node URLs an enriched
            # copy is also yielded later from anchor_page().
            yield item

    def anchor_page(self, response):
        # Retrieve the item handed over via the request meta and attach the
        # first paragraph of the linked node page (XPath targets node-7872).
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//*[@id="node-7872"]/div[1]/p').extract()[0]
        yield old_item
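
The spider imports StackItem from stack.items, which is not part of this paste. Below is a minimal sketch of what that module might look like, assuming only the three fields the spider assigns (title, url, data); the file path stack/items.py and the field set are assumptions, not shown in the original.

    # stack/items.py (hypothetical sketch, not from the original paste)
    import scrapy


    class StackItem(scrapy.Item):
        title = scrapy.Field()  # link text of each document entry
        url = scrapy.Field()    # absolute URL built with response.urljoin()
        data = scrapy.Field()   # paragraph text attached in anchor_page()

With an item definition like this in place, the spider would typically be run from the project root with something like "scrapy crawl stack -o documents.json" (the output file name is only an example).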