Advertisement
Guest User

Untitled

a guest
Jun 15th, 2014
415
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.46 KB | None | 0 0
  1. from scrapy.spider import Spider
  2. from scrapy.selector import Selector
  3. from nettuts.items import NettutsItem
  4. from scrapy.http import Request
  5. import re
  6.  
  class MySpider(Spider):
      """Spider that walks the paginated post listing on code.tutsplus.com.

      Every response is scanned twice: once for pagination links, which are
      requested with this same callback, and once for post titles, each of
      which is yielded as a ``NettutsItem``.
      """

      name = "nettuts"
      # Bare host names only (no scheme, no path): Scrapy's offsite filtering
      # compares these against the host of every outgoing request.
      allowed_domains = ["code.tutsplus.com"]
      start_urls = ["http://code.tutsplus.com/"]

      # Prefix prepended to the hrefs taken from the pagination nav.
      # NOTE(review): assumes those hrefs are site-relative ("/posts?page=N");
      # an absolute href would produce a URL the pattern below rejects.
      _BASE_URL = "http://code.tutsplus.com"

      # Only listing pages of the form <scheme>://code.tutsplus.com/posts?page=N
      # are followed. Compiled once here instead of on every parse() call.
      _LINK_PATTERN = re.compile(
          r"^(?:ftp|http|https)://code\.tutsplus\.com/posts\?page=\d+$"
      )

      def __init__(self, *args, **kwargs):
          """Initialise the spider and its per-instance set of followed links."""
          super(MySpider, self).__init__(*args, **kwargs)
          # Kept on the instance so it survives across parse() calls; a local
          # variable would be reset for every response.
          self._crawled_links = set()

      def parse(self, response):
          """Yield follow-up requests for pagination links, then post items.

          Args:
              response: The downloaded page handed over by Scrapy.

          Yields:
              ``Request`` objects for listing pages not yet followed, then one
              ``NettutsItem`` (with its ``title`` field set) per post title
              found on the page.
          """
          sel = Selector(response)

          for href in sel.xpath("//nav[@class='pagination']/a/@href").extract():
              link = self._BASE_URL + href  # add pre url string

              # Skip anything already followed or not a proper listing page.
              if link in self._crawled_links or not self._LINK_PATTERN.match(link):
                  continue

              print("Meet\n\n\n")
              self._crawled_links.add(link)
              # Yielding does not interrupt scraping of the current page: the
              # generator resumes right after each yield, so the titles below
              # are still extracted from this response.
              yield Request(link, self.parse)

          for title in sel.xpath("//a[@class='posts__post-title']/text()").extract():
              item = NettutsItem()
              item["title"] = title
              yield item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement