Splash_Example

Lua Script:

function use_crawlera(splash)
    -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
    -- Have a look at the file spiders/quotes-js.py to see how to do it.
    -- Find your Crawlera credentials in https://app.scrapinghub.com/
    local user = splash.args.crawlera_user

    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = 'X-Crawlera-Session'
    local session_id = 'create'

    splash:on_request(function (request)
        -- The commented code below can be used to speed up crawling. It
        -- filters requests to undesired domains and useless resources.
        -- Uncomment the rules that make sense for your use case and add
        -- your own.

        -- Discard requests to advertising and tracking domains.
        -- if string.find(request.url, 'doubleclick%.net') or
        --    string.find(request.url, 'analytics%.google%.com') then
        --     request.abort()
        --     return
        -- end

        -- Avoid using Crawlera for subresource fetching to increase
        -- crawling speed. The rules below bypass Crawlera for URLs
        -- starting with 'static.' and for ones matching '.png', '.css',
        -- '/xjs' and '.js'.
        if string.find(request.url, '://static%.') ~= nil or
           string.find(request.url, '%.png$') ~= nil or
           string.find(request.url, '%.css') ~= nil or
           string.find(request.url, '/xjs') ~= nil or
           string.find(request.url, '%.js') ~= nil then
            return
        end

        request:set_header('X-Crawlera-Cookies', 'disable')
        request:set_header('X-Crawlera-Profile', 'desktop')
        request:set_header('Upgrade-Insecure-Requests', '1')
        request:set_header('Connection', 'keep-alive')
        request:set_header('DNT', '1')
        request:set_header('X-Crawlera-Timeout', '180000')
        -- Reuse the Crawlera session captured from the first response, so
        -- all requests go out through the same outgoing IP.
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=''}
    end)

    splash:on_response_headers(function (response)
        -- Note: type() never returns nil, so the original check
        -- 'type(response.headers[session_header]) ~= nil' was always true;
        -- compare the header value itself instead.
        if response.headers[session_header] ~= nil then
            session_id = response.headers[session_header]
        end
    end)
end

function main(splash)
    splash.images_enabled = false
    splash.private_mode_enabled = false

    use_crawlera(splash)
    splash:set_user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")

    splash:go(splash.args.url)
    splash:wait(10)

    splash:set_viewport_full()
    return {
        jpeg=splash:jpeg(),
        har=splash:har(),
        html=splash:html()
    }
end
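
To exercise the Lua script outside of Scrapy, you can POST it to Splash's /execute HTTP endpoint directly. Below is a minimal sketch, assuming a local Splash instance on port 8050; the target URL and the Crawlera key are placeholders, not values from this project:

import requests

SPLASH = 'http://localhost:8050/execute'      # assumed local Splash instance

with open('scripts/lua_example.lua') as f:    # same path the spider loads
    lua_source = f.read()

resp = requests.post(SPLASH, json={
    'lua_source': lua_source,
    'url': 'https://example.com/',            # placeholder target URL
    'timeout': 60,
    'crawlera_user': 'YOUR_CRAWLERA_APIKEY',  # placeholder credential
})
resp.raise_for_status()
data = resp.json()
print(len(data['html']), 'characters of rendered HTML')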

Spider:

from pkgutil import get_data
import scrapy
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header
import base64


class Splashlua(scrapy.Spider):
    name = "splash_example"
    timeout = 60

    def __init__(self, *args, **kwargs):
        # To be able to load the Lua script on Scrapy Cloud, make sure your
        # project's setup.py file contains the "package_data" setting (see
        # the setup.py sketch after this spider).
        self.LUA_SOURCE = get_data(
            'hosted_splash', 'scripts/lua_example.lua'
        ).decode('utf-8')
        super(Splashlua, self).__init__(*args, **kwargs)

    def start_requests(self):
        yield SplashRequest(
            url='https://www.harveynorman.com.sg/',
            callback=self.parse_category,
            endpoint="execute",
            splash_headers={
                'Authorization': basic_auth_header(
                    self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'timeout': 60,
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
            },
            cache_args=['lua_source'],
        )

    def parse_category(self, response):
        # The Lua script returns the screenshot base64-encoded; decode it
        # and write it to disk.
        jpeg = response.data['jpeg']
        with open('hosted_immig3.jpeg', 'wb') as f:
            f.write(base64.b64decode(jpeg))
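
The __init__ comment above refers to packaging the Lua script so that pkgutil.get_data() can find it after deployment to Scrapy Cloud. A minimal sketch of such a setup.py follows; the 'hosted_splash' package name and scripts/ path mirror the get_data() call in the spider, while the project name and version are assumptions:

from setuptools import setup, find_packages

setup(
    name='hosted-splash-example',   # assumed project name
    version='1.0',
    packages=find_packages(),
    # Ship the Lua script inside the Python package so pkgutil.get_data()
    # can load it after deployment.
    package_data={'hosted_splash': ['scripts/*.lua']},
    entry_points={'scrapy': ['settings = hosted_splash.settings']},
)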

settings.py

DOWNLOADER_MIDDLEWARES = {
    # Engine side
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # Downloader side
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

CRAWLERA_APIKEY = 'APIkey'

SPLASH_URL = 'localhost or Scrapinghub hosted Splash URL'
SPLASH_APIKEY = 'APIkey if using hosted Splash'
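
With the settings above in place, the spider can be started with `scrapy crawl splash_example`, or programmatically as in the sketch below. The import path is an assumption based on the 'hosted_splash' package name used in get_data():

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from hosted_splash.spiders.splash_example import Splashlua  # assumed module path

process = CrawlerProcess(get_project_settings())
process.crawl(Splashlua)
process.start()  # blocks until the crawl finishes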