Advertisement
Guest User

Untitled

a guest
Jul 1st, 2015
1,370
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.15 KB | None | 0 0
  1. from scrapy.contrib.spiders import CrawlSpider, Rule
  2. from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
  3. from scrapy.selector import HtmlXPathSelector
  4. from items import TripadvisorItem, Review,SpecialRatings
  5. import re
  6. import codecs
  7.  
  8.  
  9. class tripAdvisorSpider(CrawlSpider):
  10.     name = "tripAdvisorSpider"
  11.     allowed_domains = ['tripadvisor.com']
  12.     start_urls = []
  13.     f = open('start_urls.txt', "r")
  14.     for line in f:
  15.         if len(line) > 0:
  16.             start_urls.append(line)
  17.             # use the first rule that succeed to apply
  18.     rules = (
  19.         Rule (SgmlLinkExtractor(allow=("ShowUserReviews-g.*",), restrict_xpaths=('//*[@id="REVIEWS"]/div[4]/div/div[2]/div/div/div[1]/a',), unique=True), callback='parse_item', follow= True),
  20.  
  21.         Rule(SgmlLinkExtractor(allow=("ShowUserReviews-g.*",),restrict_xpaths=('//*[@id="REVIEWS"]/div[contains(@class,"deckTools btm")]',),unique=True),callback='parse_item',follow=True),
  22.     )
  23.  
  24.     def parse_item(self, response):
  25.         hxs = HtmlXPathSelector(response)
  26.         item = TripadvisorItem()
  27.  
  28.         item['url'] = response.url.encode('ascii', errors='ignore')
  29.  
  30.         item['state'] =  hxs.xpath('//*[@id="PAGE"]/div[2]/div[1]/ul/li[2]/a/span/text()').extract()[0].encode('ascii', errors='ignore')
  31.         if(item['state']==[]):
  32.             item['state']=hxs.xpath('//*[@id="HEADING_GROUP"]/div[2]/address/span/span/span[contains(@class,"region_title")][2]/text()').extract()
  33.  
  34.         item['city'] =  hxs.select('//*[@id="PAGE"]/div[2]/div[1]/ul/li[3]/a/span/text()').extract()
  35.         if(item['city']==[]):
  36.             item['city'] =hxs.xpath('//*[@id="HEADING_GROUP"]/div[2]/address/span/span/span[1]/span/text()').extract()
  37.         if(item['city']==[]):
  38.           item['city']=hxs.xpath('//*[@id="HEADING_GROUP"]/div[2]/address/span/span/span[3]/span/text()').extract()
  39.         item['city']= item['city'][0].encode('ascii', errors='ignore')
  40.  
  41.         item['hotelName'] =  hxs.xpath('//*[@id="HEADING"]/span[2]/span/a/text()').extract()
  42.         item['hotelName']=item['hotelName'][0].encode('ascii', errors='ignore')
  43.  
  44.         reviews = hxs.select('.//div[contains(@id, "review")]')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement