Advertisement
Guest User

Untitled

a guest
Dec 30th, 2012
280
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.70 KB | None | 0 0
  1. from scrapy.selector import HtmlXPathSelector
  2. from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
  3. from scrapy.contrib.spiders import CrawlSpider, Rule
  4. from messer.items import MesserItem
  5.  
  6. class JustcrawlSpider(CrawlSpider):
  7.     name = 'justcrawl'
  8.     allowed_domains = ['www.professormesser.com']
  9.     start_urls = ['http://www.professormesser.com/n10-005/free-network-plus/']
  10.  
  11.     rules = (
  12.         Rule(SgmlLinkExtractor(restrict_xpaths=('//table/tr/td/div')), callback='parse_item'),
  13.     )
  14.  
  15.     def parse_item(self, response):
  16.         hxs = HtmlXPathSelector(response)
  17.         i = MesserItem()
  18.         i['youtube'] = hxs.select('//div/center/iframe/@src').extract()
  19.         i['name'] = hxs.select('//h1/a/text()').extract()
  20.         return i
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement