Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from scrapy.contrib.spiders import CrawlSpider, Rule
- from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
- from scrapy.selector import HtmlXPathSelector
- from items import TripadvisorItem, Review,SpecialRatings
- import re
- import codecs
class tripAdvisorSpider(CrawlSpider):
    """Crawl TripAdvisor review pages and build a TripadvisorItem per page.

    Seed URLs are read from ``start_urls.txt`` (one URL per line) when the
    class body is executed. Links matching ``ShowUserReviews-g.*`` inside the
    REVIEWS section are followed and handed to :meth:`parse_item`.
    """

    name = "tripAdvisorSpider"
    allowed_domains = ['tripadvisor.com']

    # Strip each line so that the trailing newline does not end up in the
    # request URL, and drop blank lines entirely. The ``with`` block makes
    # sure the file handle is closed once the URLs have been read.
    with open('start_urls.txt', "r") as f:
        start_urls = [line.strip() for line in f if line.strip()]

    # use the first rule that succeeds to apply
    rules = (
        # Links to individual reviews inside the review listing.
        Rule(
            SgmlLinkExtractor(
                allow=("ShowUserReviews-g.*",),
                restrict_xpaths=('//*[@id="REVIEWS"]/div[4]/div/div[2]/div/div/div[1]/a',),
                unique=True,
            ),
            callback='parse_item',
            follow=True,
        ),
        # Links found in the "deckTools btm" block of the REVIEWS section
        # (presumably the pagination controls -- confirm against the page).
        Rule(
            SgmlLinkExtractor(
                allow=("ShowUserReviews-g.*",),
                restrict_xpaths=('//*[@id="REVIEWS"]/div[contains(@class,"deckTools btm")]',),
                unique=True,
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    @staticmethod
    def _first_text(selector, xpaths):
        """Return the first text extracted by any of ``xpaths``.

        The XPath expressions are tried in order; the first one that yields
        at least one result wins. The value is returned ASCII-encoded with
        non-ASCII characters dropped. If no expression matches, an empty
        string is returned instead of raising ``IndexError``.
        """
        for xpath in xpaths:
            matches = selector.xpath(xpath).extract()
            if matches:
                return matches[0].encode('ascii', errors='ignore')
        return ''

    def parse_item(self, response):
        """Extract url, state, city and hotel name from a review page.

        Each location field is looked up in the breadcrumb first and then in
        the page heading, because the primary XPath does not match on every
        page layout.
        """
        hxs = HtmlXPathSelector(response)
        item = TripadvisorItem()
        item['url'] = response.url.encode('ascii', errors='ignore')

        # Resolve the fallback *before* indexing: the original indexed the
        # primary result first, so a miss raised IndexError and the fallback
        # expression could never run.
        item['state'] = self._first_text(hxs, (
            '//*[@id="PAGE"]/div[2]/div[1]/ul/li[2]/a/span/text()',
            '//*[@id="HEADING_GROUP"]/div[2]/address/span/span/span[contains(@class,"region_title")][2]/text()',
        ))
        item['city'] = self._first_text(hxs, (
            '//*[@id="PAGE"]/div[2]/div[1]/ul/li[3]/a/span/text()',
            '//*[@id="HEADING_GROUP"]/div[2]/address/span/span/span[1]/span/text()',
            '//*[@id="HEADING_GROUP"]/div[2]/address/span/span/span[3]/span/text()',
        ))
        item['hotelName'] = self._first_text(hxs, (
            '//*[@id="HEADING"]/span[2]/span/a/text()',
        ))

        # ``xpath`` is used instead of the deprecated ``select`` alias so the
        # method is consistent with the other lookups above.
        reviews = hxs.xpath('.//div[contains(@id, "review")]')
        # NOTE(review): the visible source ends here. ``reviews`` is not yet
        # consumed and ``item`` is not returned in the part of the method that
        # is available -- restore the remainder from the original file.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement