Guest User

Untitled

a guest
Jul 19th, 2018
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.91 KB | None | 0 0
  1. import re
  2.  
  3. from scrapy.selector import HtmlXPathSelector
  4. from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
  5. from scrapy.contrib.spiders import CrawlSpider, Rule
  6. from linkeddata.items import LinkeddataItem
  7.  
  class NextmediaSpider(CrawlSpider):
      """Spider for tw.nextmedia.com that logs the content of article pages.

      Links whose URL matches ``applenews/article/art_id/`` are fetched and
      handed to :meth:`parse_content`; links found on those article pages are
      not followed any further (``follow=False``).
      """

      name = 'nextmedia'
      allowed_domains = ['tw.nextmedia.com']

      # NOTE(review): no ``start_urls`` / ``start_requests`` is defined in the
      # visible code -- confirm the seed URLs are supplied elsewhere,
      # otherwise the crawl has nothing to start from.

      rules = (
          Rule(
              SgmlLinkExtractor(allow=r'applenews/article/art_id/'),
              callback='parse_content',
              follow=False,
          ),
      )

      def parse_content(self, response):
          """Log the summary and every titled section of an article page.

          Args:
              response: the downloaded article page passed in by the crawler.

          Returns:
              None. The extracted text is only written to the spider log.
          """
          hxs = HtmlXPathSelector(response)

          # A page without a summary paragraph yields an empty list; guard
          # the lookup so one odd page does not abort the callback with an
          # IndexError.
          summaries = hxs.select("//p[@class='summary']/text()").extract()
          if summaries:
              self.log("Summary: %s" % summaries[0].strip())
          else:
              self.log("Summary: (not found)")

          titles = hxs.select("//h2[@class='article_title']/text()").extract()
          texts = hxs.select("//p[@class='article_text']").extract()

          # Titles and texts are paired by position. A page may carry fewer
          # text paragraphs than titles, so only log a text when one exists
          # at the same index instead of indexing past the end of ``texts``.
          text_count = len(texts)
          for index, title in enumerate(titles):
              self.log("Section Title: %s" % title.strip())
              if index < text_count:
                  self.log(texts[index])
Add Comment
Please, Sign In to add comment