Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
- from scrapy.contrib.loader import ItemLoader
- from scrapy.contrib.spiders import CrawlSpider,Rule
- from scrapy.selector import XmlXPathSelector
- from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
- from cancerstories.items import CancerstoriesItem
class LungcancerSpider(CrawlSpider):
    """Crawl survivor stories published on coloncancercoalition.org.

    The crawl starts at the survivor-stories index page and follows every
    link whose URL matches the pattern in ``rules``. Each matching page is
    handed to :meth:`parse_page`, which produces one ``CancerstoriesItem``.

    NOTE(review): the spider is registered as ``"lungcancer"`` although it
    targets a colon-cancer site. The name is kept because it is the
    identifier callers use to launch the spider.
    """

    name = "lungcancer"
    allowed_domains = ["coloncancercoalition.org"]
    start_urls = (
        'http://www.coloncancercoalition.org/community/stories/survivor-stories/',
    )

    rules = (
        Rule(
            SgmlLinkExtractor(
                # Three numeric path segments followed by a word segment
                # (presumably /year/month/day/slug -- confirm against the
                # site). The character classes need their backslashes:
                # a bare ``d+`` only matches literal "d" characters, so no
                # story link would ever be followed. Dots in the host are
                # escaped so they match a literal "." only.
                allow=[r'http://www\.coloncancercoalition\.org/\d+/\d+/\d+/\w+'],
            ),
            callback='parse_page',
            follow=True,
        ),
    )

    def parse_page(self, response):
        """Extract one story item from a page matched by ``rules``.

        Args:
            response: The downloaded page passed in by the crawl rule.

        Yields:
            The item built by the loader, with ``name`` and ``story``
            populated from the XPath expressions below.
        """
        loader = ItemLoader(item=CancerstoriesItem(), response=response)
        # NOTE(review): the absolute path is tied to the exact page layout
        # and will stop matching if the markup changes -- verify against a
        # live page before relying on it.
        loader.add_xpath('name', '/html/body/div[4]/div[1]/div[1]/div/h1/text()')
        loader.add_xpath('story', '//../div/div/p/text()')
        yield loader.load_item()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement