Guest User

Untitled

a guest
Jul 16th, 2016
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.23 KB | None | 0 0
  1. from scrapy import Request
  2. from scrapy.selector import Selector
  3. from scrapy.linkextractors import LinkExtractor
  4. from scrapy.spiders import Rule, CrawlSpider
  5. from property.items import PropertyItem
  6. import sys
  7. from PyQt4.QtGui import *
  8. from PyQt4.QtCore import *
  9. from PyQt4.QtWebKit import *
  10. from lxml import html
  11.  
# Python 2 idiom: reload the sys module, then force the interpreter-wide
# default string encoding to UTF-8. The reload must come first; the two
# statements are order-dependent.
reload(sys)
sys.setdefaultencoding('utf8')  # To prevent UnicodeDecodeError, UnicodeEncodeError.
  14.  
  15. # def print_values(item):
  16. # print('\ntitle ' + item['title'])
  17. # print('\nurl ' + item['url'])
  18. # print('\ndescription ' + item['description'])
  19. # print('\nproperty type ' + str(item['property_type']))
  20. # print('\nad type ' + str(item['ad_type']))
  21. # print('\nprice ' + str(item['price']))
  22. # print('\nlocation ' + str(item['location']))
  23. # print('\nbedrooms ' + str(item['bedrooms']))
  24. # print('\nuser ' + str(item['user']))
  25. # print('\ndate ' + str(item['date']))
  26.  
# Module-level accumulator of listing-detail URLs. It is filled by
# VivastreetSpider.parse_start_url and handed to Render when the spider closes.
entry_urls = []
  28.  
  29. def scrape(url, lxml):
  30. print('Scraping %s %s.' % (url, str(lxml)))
  31. print('\nProperty type: ' + str(lxml.xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[4]/td[2]/text()[0]')))
  32. print('\nPrice: ' + str(lxml.xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]//tr[1]/td[2]/text()[0]')))
  33.  
  34. class VivastreetSpider(CrawlSpider):
  35. name = 'viva'
  36. allowed_domains = ['chennai.vivastreet.co.in']
  37. start_urls = ['http://chennai.vivastreet.co.in/rent+chennai/']
  38. rules = [
  39. Rule(LinkExtractor(restrict_xpaths = '//*[text()[contains(., "Next")]]'), callback = 'parse_start_url', follow = True)
  40. ] #To handle pagination.
  41.  
  42. def closed(self, reason):
  43. print('Closing spider with %d URL(s) fetched.' % len(entry_urls))
  44. r = Render(entry_urls, scrape)
  45.  
  46. def parse_start_url(self, response):
  47. urls = Selector(response).xpath('//a[contains(@id, "vs-detail-link")]/@href').extract()
  48. entry_urls.extend(urls)
  49. print(str(len(urls)) + ' entries scraped. Total %d URL(s) fetched.' % len(entry_urls))
  50.  
  51. class Render(QWebPage):
  52. def __init__(self, urls, cb):
  53. self.app = QApplication(sys.argv)
  54. QWebPage.__init__(self)
  55. self.loadFinished.connect(self._loadFinished)
  56. self.urls = urls
  57. self.cb = cb
  58. self.crawl()
  59. self.app.exec_()
  60.  
  61. def crawl(self):
  62. if self.urls:
  63. url = self.urls.pop(0)
  64. self.mainFrame().load(QUrl(url))
  65. print('Downloaded ' + url)
  66. else:
  67. self.app.quit()
  68.  
  69. def _loadFinished(self, result):
  70. self.frame = self.mainFrame()
  71. url = str(self.frame.url().toString())
  72. result = self.frame.toHtml()
  73. lxml = html.fromstring(str(result.toAscii()))
  74. self.cb(url, lxml)
  75. self.crawl()
  76.  
  77.  
  78.  
  79.  
  80.  
  81.  
  82.  
  83. # def parse_link(self, response):
  84. # item = PropertyItem()
  85. # item['title'] = Selector(response).xpath('//h1[@class = "kiwii-font-xlarge kiwii-margin-none"]/text()').extract()[0]
  86. # item['url'] = response.url
  87. # item['description'] = Selector(response).xpath('//div[@class = "shortdescription"]/text()').extract()[0]
  88. # item['property_type'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[4]/td[2]/text()').extract()
  89. # item['ad_type'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[3]/td[2]/text()').extract()
  90. # item['price'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[1]/td[2]/text()').extract()
  91. # item['location'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[2]/td[2]/text()').extract()
  92. # item['bedrooms'] = Selector(response).xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[5]/td[2]/text()').extract()
  93.  
  94. # tree = get_xpath(response.url)
  95.  
  96. # item['property_type'] = tree.xpath('//table[@class = "kiwii-width-full kiwii-padding-top-xxsmall kiwii-clear-both"]/tbody[1]/tr[4]/td[2]/text()')
  97.  
  98. # item['user'] = Selector(response).xpath('//span[@class = "kiwii-xxdark-gray"]/text()').extract()[0].split()[2]
  99. # item['date'] = Selector(response).xpath('//span[@class = "kiwii-xxdark-gray"]/text()').extract()[0].split()[4]
  100.  
  101. # print_values(item)
Add Comment
Please, Sign In to add comment