Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- from scrapy.selector import HtmlXPathSelector
- from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
- from scrapy.contrib.spiders import CrawlSpider, Rule
- from linkeddata.items import LinkeddataItem
class NextmediaSpider(CrawlSpider):
    """Crawl spider that logs the contents of article pages on tw.nextmedia.com.

    Links whose URL matches ``applenews/article/art_id/`` are passed to
    :meth:`parse_content`. Links found on those article pages are not
    followed any further (``follow=False``).
    """

    # NOTE(review): no start_urls / start_requests is defined in this class;
    # confirm the entry point is supplied elsewhere.
    name = 'nextmedia'
    # Requests to any other domain are filtered out by Scrapy's offsite check.
    allowed_domains = ['tw.nextmedia.com']
    rules = (
        Rule(
            SgmlLinkExtractor(allow=r'applenews/article/art_id/'),
            callback='parse_content',
            follow=False
        ),
    )

    def parse_content(self, response):
        """Log the summary and every section title/body pair of an article.

        Args:
            response: the downloaded article page delivered by the crawl rule.

        Returns:
            None. Nothing is yielded; the extracted content is only logged.
        """
        hxs = HtmlXPathSelector(response)

        # A page may have no summary paragraph. Indexing [0] unconditionally
        # would raise IndexError and abort the callback before the sections
        # are logged.
        summaries = hxs.select("//p[@class='summary']/text()").extract()
        if summaries:
            self.log("Summary: %s" % summaries[0].strip())

        titles = hxs.select("//h2[@class='article_title']/text()").extract()
        # No trailing /text() here: the XPath selects the <p> elements
        # themselves, so each entry is the element's serialized markup.
        texts = hxs.select("//p[@class='article_text']").extract()

        # zip() pairs titles with texts and stops at the shorter list, so a
        # page with more titles than text paragraphs no longer raises
        # IndexError.
        for title, text in zip(titles, texts):
            self.log("Section Title: %s" % title.strip())
            self.log(text)
Add Comment
Please sign in to add a comment.