Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from scrapy import Request
- from scrapy.selector import Selector
- from scrapy.linkextractors import LinkExtractor
- from scrapy.spiders import Rule, CrawlSpider
- from property.items import PropertyItem
- import sys
- from PyQt4.QtGui import *
- from PyQt4.QtCore import *
- from PyQt4.QtWebKit import *
- from lxml import html
- reload(sys) # Python 2 only: re-initialises the sys module so that sys.setdefaultencoding is available for the call below.
- sys.setdefaultencoding('utf8') # Make UTF-8 the default codec for implicit str/unicode conversions, to prevent UnicodeDecodeError and UnicodeEncodeError.
- # def print_values(item):
- # print('\ntitle ' + item['title'])
- # print('\nurl ' + item['url'])
- # print('\ndescription ' + item['description'])
- # print('\nproperty type ' + str(item['property_type']))
- # print('\nad type ' + str(item['ad_type']))
- # print('\nprice ' + str(item['price']))
- # print('\nlocation ' + str(item['location']))
- # print('\nbedrooms ' + str(item['bedrooms']))
- # print('\nuser ' + str(item['user']))
- # print('\ndate ' + str(item['date']))
- entry_urls = [] # Module-level accumulator: VivastreetSpider.parse_start_url appends ad URLs here; VivastreetSpider.closed passes the list to Render.
def _first_text(tree, xpath):
    """Return the first value matched by *xpath* in *tree*, stripped, or ''."""
    matches = tree.xpath(xpath)
    if not matches:
        return ''
    return str(matches[0]).strip()


def scrape(url, lxml):
    """Print the property type and price found on one rendered ad page.

    Args:
        url: Address of the page; only used in the progress message.
        lxml: Parsed document exposing an ``xpath(expression)`` method that
            returns a list of matches (an lxml HTML element in this file).

    Returns:
        None. The values are written to standard output; a field that is
        missing from the page is printed as an empty string.
    """
    details = ('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall '
               'kiwii-clear-both"]/tbody[1]')
    print('Scraping %s %s.' % (url, str(lxml)))
    # XPath positions are 1-based, so the former ``text()[0]`` predicate could
    # never match. Select every text node and take the first one in Python.
    print('\nProperty type: ' + _first_text(lxml, details + '/tr[4]/td[2]/text()'))
    print('\nPrice: ' + _first_text(lxml, details + '//tr[1]/td[2]/text()'))
class VivastreetSpider(CrawlSpider):
    """Collect the URLs of rental ads listed on chennai.vivastreet.co.in.

    The spider yields no items itself. Each listing page only contributes ad
    links to the module-level ``entry_urls`` list; when the crawl ends,
    ``closed`` passes that list to ``Render`` with ``scrape`` as the
    per-page callback.
    """

    name = 'viva'
    allowed_domains = ['chennai.vivastreet.co.in']
    start_urls = ['http://chennai.vivastreet.co.in/rent+chennai/']
    rules = [
        Rule(LinkExtractor(restrict_xpaths='//*[text()[contains(., "Next")]]'),
             callback='parse_start_url', follow=True),
    ]  # To handle pagination.

    def closed(self, reason):
        """Report the number of collected URLs and render them, if any.

        Args:
            reason: Close reason supplied by the caller; not used here.
        """
        print('Closing spider with %d URL(s) fetched.' % len(entry_urls))
        if not entry_urls:
            # Nothing to render: do not start the Qt renderer at all.
            return
        Render(entry_urls, scrape)

    def parse_start_url(self, response):
        """Record the ad links found on one listing page.

        Used both for the start URL and, through ``rules``, for every
        "Next" page.

        Args:
            response: The downloaded listing page.
        """
        hrefs = response.xpath('//a[contains(@id, "vs-detail-link")]/@href').extract()
        # Resolve against the page URL so relative hrefs stay loadable later.
        urls = [response.urljoin(href) for href in hrefs]
        entry_urls.extend(urls)
        print(str(len(urls)) + ' entries scraped. Total %d URL(s) fetched.' % len(entry_urls))
class Render(QWebPage):
    """Load URLs one after another in a QtWebKit page and report each result.

    Constructing an instance is blocking: ``__init__`` starts loading the
    first URL and runs the Qt event loop until every URL has been processed.
    For each successfully loaded page, ``cb(url, tree)`` is called with the
    page URL and the rendered HTML parsed by ``lxml.html``.
    """

    def __init__(self, urls, cb):
        """Process *urls* in order, calling *cb* for each rendered page.

        Args:
            urls: Iterable of URL strings. It is copied, so the caller's
                sequence is left untouched.
            cb: Callable taking ``(url, tree)``.
        """
        # A QApplication must exist before the page is created; reuse a
        # running one rather than trying to create a second instance.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.urls = list(urls)
        self.cb = cb
        if not self.urls:
            # With nothing queued, crawl() would call quit() before the event
            # loop is running (a no-op) and exec_() would then never return.
            return
        self.crawl()
        self.app.exec_()

    def crawl(self):
        """Start loading the next queued URL, or stop the event loop."""
        if self.urls:
            url = self.urls.pop(0)
            # load() only starts the request; _loadFinished runs when it ends.
            self.mainFrame().load(QUrl(url))
            print('Downloaded ' + url)
        else:
            self.app.quit()

    def _loadFinished(self, result):
        """Handle the end of a page load and move on to the next URL.

        Args:
            result: True when the page loaded successfully, False otherwise.
        """
        self.frame = self.mainFrame()
        url = str(self.frame.url().toString())
        try:
            if result:
                markup = self.frame.toHtml()
                tree = html.fromstring(str(markup.toAscii()))
                self.cb(url, tree)
            else:
                print('Failed to load ' + url)
        finally:
            # Always advance the queue; otherwise an error raised above would
            # leave the event loop running with nothing left to trigger it.
            self.crawl()
- # def parse_link(self, response):
- # item = PropertyItem()
- # item['title'] = Selector(response).xpath('//h1[@class = "kiwii-font-xlarge kiwii-margin-none"]/text()').extract()[0]
- # item['url'] = response.url
- # item['description'] = Selector(response).xpath('//div[@class = "shortdescription"]/text()').extract()[0]
- # item['property_type'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[4]/td[2]/text()').extract()
- # item['ad_type'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[3]/td[2]/text()').extract()
- # item['price'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[1]/td[2]/text()').extract()
- # item['location'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[2]/td[2]/text()').extract()
- # item['bedrooms'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[5]/td[2]/text()').extract()
- # tree = get_xpath(response.url)
- # item['property_type'] = tree.xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[4]/td[2]/text()')
- # item['user'] = Selector(response).xpath('//span[@class = "kiwii-xxdark-gray"]/text()').extract()[0].split()[2]
- # item['date'] = Selector(response).xpath('//span[@class = "kiwii-xxdark-gray"]/text()').extract()[0].split()[4]
- # print_values(item)
Add Comment
Please, Sign In to add comment