Advertisement
Not a member of Pastebin yet?
Sign Up — it unlocks many cool features!
- from scrapy import Spider
- import scrapy
- from scrapy.selector import Selector
- from stack.items import StackItem
class StackSpider(Spider):
    """Scrape document titles and URLs from aipass.org's official-documents page.

    Each list entry on the start page becomes one StackItem. Links pointing at
    site "node" pages get a follow-up request; anchor_page then completes the
    item with the node page's first paragraph before yielding it.
    """

    name = "stack"
    # BUG FIX: every scraped URL is on aipass.org, but the original listed only
    # "stackoverflow.com" — Scrapy's OffsiteMiddleware silently drops the
    # follow-up requests to aipass.org, so anchor_page never ran.
    allowed_domains = ["aipass.org", "stackoverflow.com"]
    start_urls = [
        "http://www.aipass.org/documenti%20ufficiali",
    ]

    def parse(self, response):
        """Yield one StackItem per document link; follow 'node' links for detail.

        :param response: the downloaded start page.
        """
        # response.xpath(...) is equivalent to Selector(response).xpath(...);
        # the explicit Selector wrapper in the original was redundant.
        for document in response.xpath('//*[@id="node-329"]/div[1]/ul/li'):
            item = StackItem()
            item['title'] = document.xpath('./a/text()').extract()
            item['url'] = response.urljoin(document.xpath('./a/@href').extract_first())
            if "node" in item['url']:
                # Detail page: hand the partial item to anchor_page via
                # request.meta; anchor_page yields the completed item.
                request = scrapy.Request(item['url'], callback=self.anchor_page)
                request.meta['item'] = item
                yield request
            else:
                # BUG FIX: the original yielded the item unconditionally, so
                # "node" items were emitted twice — once here without 'data'
                # and once again from anchor_page.
                yield item

    def anchor_page(self, response):
        """Attach the first paragraph of a node page to its pending item.

        :param response: the downloaded node page; carries the partial item
            in response.request.meta['item'].
        """
        old_item = response.request.meta['item']
        # BUG FIX: extract()[0] raises IndexError when the xpath matches
        # nothing; extract_first() returns None instead.
        # NOTE(review): the hard-coded id "node-7872" can only match one
        # specific page — presumably this should be generalized; confirm
        # against the site's markup.
        old_item['data'] = response.xpath('.//*[@id="node-7872"]/div[1]/p').extract_first()
        yield old_item
Advertisement
Add Comment
Please sign in to add a comment
Advertisement