Advertisement
Guest User

spidey sense

a guest
Mar 16th, 2015
594
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.55 KB | None | 0 0
  1. import scrapy
  2. from tutorial.items import DmozItem
  3. from scrapy.contrib.spiders import CrawlSpider, Rule
  4. from scrapy.contrib.linkextractors import LinkExtractor
  5.  
  6. class TsrSpider(CrawlSpider):
  7.     name = 'tsr'
  8.     allowed_domains = ['thestudentroom.co.uk']
  9.     start_urls = ['http://www.thestudentroom.co.uk/forumdisplay.php?f=143']
  10.    
  11.     download_delay = 4
  12.     user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:35.0) Gecko/20100101 Firefox/35.0'
  13.    
  14.     rules = (
  15.         Rule(
  16.             LinkExtractor(
  17.                 allow=('forumdisplay\.php\?f=143\&page=\d',),
  18.                 restrict_xpaths=("//li[@class='pager-page_numbers']/a/@href",))),
  19.        
  20.         Rule(
  21.             LinkExtractor(
  22.                 allow=('showthread\.php\?t=\d+\&page=\d+',),
  23.                 restrict_xpaths=("//li[@class='pager-page_numbers']/a/@href",)),
  24.             callback='parse_link'),
  25.  
  26.         Rule(
  27.             LinkExtractor(
  28.                 allow=('showthread\.php\?t=\d+',),
  29.                 restrict_xpaths=("//tr[@class='thread  unread    ']",)),
  30.             callback='parse_link'),
  31.         )
  32.  
  33.     def parse_link(self, response):
  34. #           Iterate over posts.    
  35.         for sel in response.xpath("//li[@class='post threadpost old   ']"):
  36.             rating = sel.xpath(
  37.             "div[@class='post-footer']//span[@class='score']/text()").extract()
  38.             if not rating:
  39.                 rating = 0
  40.             else:
  41.                 rating = rating[0]
  42.             item = DmozItem()
  43.             item['post'] = sel.xpath(
  44.     "div[@class='post-content']/blockquote[@class='postcontent restore']/text()").extract()
  45.             item['link'] = response.url
  46.             item['topic'] = response.xpath(
  47.     "//div[@class='forum-header section-header']/h1/span/text()").extract()
  48.             item['rating'] = rating
  49.             yield item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement