Advertisement
LightningStalker

Scrapy forum thread crawler

Sep 3rd, 2014
259
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.95 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import re
  3. import scrapy
  4. from scrapy.contrib.spiders import CrawlSpider, Rule
  5. from scrapy.contrib.linkextractors import LinkExtractor
  6.  
  7. from crawler.items import CrawlerItem
  8.  
  9.  
  10. RE_SPACE = re.compile('\s+', re.M)
  11.  
  12.  
  13. class LpfSpider(CrawlSpider):
  14.     name = "lpf"
  15.     allowed_domains = ["site.com"]
  16.     start_urls = (
  17.         'names have been changes to protect the innocent.com',
  18.     )
  19.    
  20.     rules = (
  21.         Rule(LinkExtractor(allow=('first-page-url(-\d{1,2})*\.html', ),
  22.         deny=('#post', )), callback='parse_item', follow=True),
  23.     )
  24.    
  25.     def parse_item(self, response):
  26.         item = CrawlerItem()
  27.         item['page'] = self.strip(response.xpath('//div[@class="nav"]/text()').extract())
  28.         return item
  29.  
  30.     def strip(self, lst):
  31.         return [self.normalize_space(item) for item in lst if item.strip()]
  32.  
  33.     def normalize_space(self, text):
  34.         return RE_SPACE.sub(' ', text).strip()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement