Advertisement
Guest User

Untitled

a guest
Feb 7th, 2016
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.89 KB | None | 0 0
  1. import scrapy
  2. from scrapy.contrib.loader import ItemLoader
  3. from scrapy.contrib.spiders import CrawlSpider,Rule
  4. from scrapy.selector import XmlXPathSelector
  5. from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
  6. from cancerstories.items import CancerstoriesItem
  7.  
  8. class LungcancerSpider(CrawlSpider):
  9. name = "lungcancer"
  10. allowed_domains = ["coloncancercoalition.org"]
  11. start_urls = (
  12. 'http://www.coloncancercoalition.org/community/stories/survivor-stories/',
  13. )
  14. rules = (
  15. Rule(SgmlLinkExtractor(allow=[r'http://www.coloncancercoalition.org/d+/d+/d+/w+']),callback='parse_page',follow=True),
  16. )
  17.  
  18. def parse_page(self, response):
  19. Li = ItemLoader(item=CancerstoriesItem(),response=response)
  20. Li.add_xpath('name', '/html/body/div[4]/div[1]/div[1]/div/h1/text()')
  21. Li.add_xpath('story','//../div/div/p/text()')
  22.  
  23. yield Li.load_item()
  24. pass
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement