Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import re
- import scrapy
- from scrapy.contrib.spiders import CrawlSpider, Rule
- from scrapy.contrib.linkextractors import LinkExtractor
- from crawler.items import CrawlerItem
# Matches any run of one or more whitespace characters (spaces, tabs, newlines).
# Written as a raw string so that ``\s`` reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape. The MULTILINE flag is
# not needed: it only changes the meaning of ``^`` and ``$``, which this
# pattern does not use.
RE_SPACE = re.compile(r'\s+')
class LpfSpider(CrawlSpider):
    """Crawl spider that walks paginated pages and collects their nav text.

    Links are selected by the single entry in ``rules``: URLs matching the
    ``allow`` pattern (and not the ``deny`` pattern) are followed and each
    fetched page is handed to :meth:`parse_item`.
    """

    name = "lpf"
    allowed_domains = ["site.com"]
    # NOTE(review): the entry below is the author's anonymised placeholder,
    # not a real URL (it has no scheme); substitute the actual start page
    # before running the spider.
    start_urls = (
        'names have been changes to protect the innocent.com',
    )

    rules = (
        Rule(
            LinkExtractor(
                # Raw string: ``\d`` and ``\.`` must reach the regex engine
                # verbatim rather than being treated as string escapes.
                # Matches "first-page-url.html" optionally followed by one
                # or more "-N"/"-NN" numeric suffixes before the extension.
                allow=(r'first-page-url(-\d{1,2})*\.html',),
                deny=('#post',),
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build a ``CrawlerItem`` from one crawled page.

        Extracts the text nodes that are direct children of every
        ``<div class="nav">`` element, cleans them with :meth:`strip`, and
        stores the resulting list under the item's ``'page'`` key.

        :param response: the response object passed in by the crawl rule.
        :returns: the populated ``CrawlerItem``.
        """
        item = CrawlerItem()
        nav_texts = response.xpath('//div[@class="nav"]/text()').extract()
        item['page'] = self.strip(nav_texts)
        return item

    def strip(self, lst):
        """Return the non-blank strings of *lst* with whitespace normalised.

        Entries that are empty or whitespace-only are dropped; the remaining
        ones are passed through :meth:`normalize_space`.

        :param lst: iterable of strings.
        :returns: a new list of cleaned strings, in the original order.
        """
        return [self.normalize_space(text) for text in lst if text.strip()]

    def normalize_space(self, text):
        """Collapse every whitespace run in *text* to one space and trim it.

        :param text: the string to clean.
        :returns: *text* with each match of ``RE_SPACE`` replaced by a single
            space and leading/trailing whitespace removed.
        """
        return RE_SPACE.sub(' ', text).strip()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement