Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from scrapy.selector import HtmlXPathSelector
- from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
- from scrapy.contrib.spiders import CrawlSpider, Rule
- from messer.items import MesserItem
class JustcrawlSpider(CrawlSpider):
    """Spider that follows links from the start page and scrapes each target.

    Crawling begins at the single URL in ``start_urls``. Links are only
    taken from elements matching ``//table/tr/td/div`` on that page, and
    every followed page is passed to :meth:`parse_item`.
    """

    # NOTE(review): the imports this class relies on (scrapy.contrib.*,
    # SgmlLinkExtractor, HtmlXPathSelector) are from older Scrapy releases;
    # confirm they exist in the Scrapy version this project is pinned to.
    name = 'justcrawl'
    allowed_domains = ['www.professormesser.com']
    start_urls = ['http://www.professormesser.com/n10-005/free-network-plus/']

    rules = (
        # The original wrapped the XPath in bare parentheses, which is still
        # a plain string (not a tuple); it is passed as a string explicitly.
        Rule(SgmlLinkExtractor(restrict_xpaths='//table/tr/td/div'),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Build a ``MesserItem`` from one followed page.

        Args:
            response: The response for a page reached through ``rules``.

        Returns:
            A ``MesserItem`` whose ``youtube`` field holds the extracted
            ``src`` attributes of ``//div/center/iframe`` and whose ``name``
            field holds the extracted text of ``//h1/a``. Both fields are
            whatever ``.extract()`` yields (a list of matches), so a page
            without a match produces an empty value rather than an error.
        """
        selector = HtmlXPathSelector(response)
        item = MesserItem()
        item['youtube'] = selector.select('//div/center/iframe/@src').extract()
        item['name'] = selector.select('//h1/a/text()').extract()
        return item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement