# Untitled

from scrapy import Request
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from property.items import PropertyItem
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html

# Python 2 only: reload() is a builtin there. sys is reloaded first and the
# process-wide default string encoding is then switched to UTF-8.
# NOTE(review): presumably the reload is what makes setdefaultencoding
# available again after interpreter start-up -- confirm before porting.
reload(sys)
sys.setdefaultencoding('utf8')	#To prevent UnicodeDecodeError, UnicodeEncodeError.

# def print_values(item):
# 	print('\ntitle ' + item['title'])
# 	print('\nurl ' + item['url'])
# 	print('\ndescription ' + item['description'])
# 	print('\nproperty type ' + str(item['property_type']))
	# print('\nad type ' + str(item['ad_type']))
	# print('\nprice ' + str(item['price']))
	# print('\nlocation ' + str(item['location']))
	# print('\nbedrooms ' + str(item['bedrooms']))
	# print('\nuser ' + str(item['user']))
	# print('\ndate ' + str(item['date']))

# Ad URLs collected by VivastreetSpider.parse_start_url; the whole list is
# handed to Render when the spider closes.
entry_urls = []

def scrape(url, lxml):
	"""Print the property type and price found on a parsed listing page.

	url  -- address of the page; used only in the progress message.
	lxml -- parsed document; any object exposing an xpath(query) method works.

	Each field is printed as the list returned by xpath(), so a field that is
	missing from the page shows up as an empty list. Nothing is returned.
	"""
	# Both fields are read from the second cell of a row in the same table.
	table_body = '//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]'
	print('Scraping %s %s.' % (url, str(lxml)))
	# XPath positions are 1-based: a text()[0] predicate can never match, so
	# the first text node of the cell has to be requested with text()[1].
	print('\nProperty type: ' + str(lxml.xpath(table_body + '/tr[4]/td[2]/text()[1]')))
	# NOTE(review): the price row is matched with '//tr[1]' (any depth) while
	# the property type uses '/tr[4]' (direct child) -- confirm this is intended.
	print('\nPrice: ' + str(lxml.xpath(table_body + '//tr[1]/td[2]/text()[1]')))

class VivastreetSpider(CrawlSpider):
	"""Crawl the Chennai rental listing pages and collect the ad URLs.

	Every listing page handled by parse_start_url (the start URL and the
	pages reached through a "Next" link) adds its ad links to the
	module-level entry_urls list. When the spider closes, that list is passed
	to Render together with scrape.
	"""
	name = 'viva'
	allowed_domains = ['chennai.vivastreet.co.in']
	start_urls = ['http://chennai.vivastreet.co.in/rent+chennai/']
	rules = [
		Rule(LinkExtractor(restrict_xpaths = '//*[text()[contains(., "Next")]]'), callback = 'parse_start_url', follow = True)
		]	#To handle pagination.

	def closed(self, reason):
		"""Render and scrape every collected URL once crawling has stopped.

		reason -- not used here.
		"""
		print('Closing spider with %d URL(s) fetched.' % len(entry_urls))
		# Render runs its Qt event loop inside the constructor, so this call
		# only returns after the URLs have been processed; the instance
		# itself is not needed afterwards.
		Render(entry_urls, scrape)

	def parse_start_url(self, response):
		"""Collect the ad links of one listing page into entry_urls.

		response -- the downloaded listing page.
		"""
		hrefs = response.xpath('//a[contains(@id, "vs-detail-link")]/@href').extract()
		# Resolve against the page URL so that relative links can still be
		# loaded later outside of Scrapy; absolute links pass through unchanged.
		urls = [response.urljoin(href) for href in hrefs]
		entry_urls.extend(urls)
		print(str(len(urls)) + ' entries scraped. Total %d URL(s) fetched.' % len(entry_urls))

class Render(QWebPage):
	"""Load URLs one after another in QtWebKit and pass each page to a callback.

	The constructor blocks: it runs the Qt event loop until every URL has been
	processed, then returns.
	"""
	def __init__(self, urls, cb):
		"""Process all of urls, calling cb for each page that loads.

		urls -- URLs to load, in order. The caller's list is left untouched.
		cb   -- called as cb(url, tree), where tree is the lxml document
		parsed from the rendered HTML of the page.
		"""
		self.app = QApplication(sys.argv)
		QWebPage.__init__(self)
		self.loadFinished.connect(self._loadFinished)
		# Work on a copy: crawl() consumes the queue with pop().
		self.urls = list(urls)
		self.cb = cb
		# With nothing to load, crawl() would call quit() before the event
		# loop is running, which has no effect, and exec_() would then block
		# forever. Only enter the loop when there is work to do.
		if self.urls:
			self.crawl()
			self.app.exec_()

	def crawl(self):
		"""Start loading the next queued URL, or quit the event loop if none is left."""
		if self.urls:
			url = self.urls.pop(0)
			# The load is only started here; _loadFinished is invoked through
			# the loadFinished signal once it has completed.
			self.mainFrame().load(QUrl(url))
			print('Downloaded ' + url)
		else:
			self.app.quit()

	def _loadFinished(self, result):
		"""Parse the page that just finished loading and move on to the next URL.

		result -- value delivered by the loadFinished signal; Qt documents it
		as the success flag of the load.
		"""
		try:
			self.frame = self.mainFrame()
			url = str(self.frame.url().toString())
			if not result:
				# Nothing useful to parse for a page that did not load.
				print('Failed to load ' + url)
				return
			# toAscii() would replace every non-ASCII character; keep the text
			# intact as UTF-8 and tell the parser which encoding it is getting.
			markup = bytes(self.frame.toHtml().toUtf8())
			tree = html.fromstring(markup, parser = html.HTMLParser(encoding = 'utf-8'))
			self.cb(url, tree)
		finally:
			# Always continue with the next URL, even if parsing or the
			# callback raised; otherwise the event loop would never be quit.
			self.crawl()


	# def parse_link(self, response):
		# item = PropertyItem()
		# item['title'] = Selector(response).xpath('//h1[@class = "kiwii-font-xlarge kiwii-margin-none"]/text()').extract()[0]
		# item['url'] = response.url
		# item['description'] = Selector(response).xpath('//div[@class = "shortdescription"]/text()').extract()[0]
		# item['property_type'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[4]/td[2]/text()').extract()
		# item['ad_type'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[3]/td[2]/text()').extract()
		# item['price'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[1]/td[2]/text()').extract()
		# item['location'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[2]/td[2]/text()').extract()
		# item['bedrooms'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[5]/td[2]/text()').extract()

		# tree = get_xpath(response.url)

		# item['property_type'] = tree.xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[4]/td[2]/text()')

		# item['user'] = Selector(response).xpath('//span[@class = "kiwii-xxdark-gray"]/text()').extract()[0].split()[2]
		# item['date'] = Selector(response).xpath('//span[@class = "kiwii-xxdark-gray"]/text()').extract()[0].split()[4]

		# print_values(item)