Lua Script:

function use_crawlera(splash)
    -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
    -- Have a look at the file spiders/quotes-js.py to see how to do it.
    -- Find your Crawlera credentials in https://app.scrapinghub.com/
    local user = splash.args.crawlera_user

    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = 'X-Crawlera-Session'
    local session_id = 'create'

    splash:on_request(function (request)
        -- The commented code below can be used to speed up the crawling
        -- process. It filters out requests to undesired domains and useless
        -- resources. Uncomment the rules that make sense for your use case
        -- and add your own.

        -- Discard requests to advertising and tracking domains.
        -- if string.find(request.url, 'doubleclick%.net') or
        --         string.find(request.url, 'analytics%.google%.com') then
        --     request.abort()
        --     return
        -- end

        -- Avoid using Crawlera to fetch subresources, which speeds up the
        -- crawl. The rules below skip Crawlera for URLs starting with
        -- 'static.' and for URLs ending in '.png', '.css', or '.js'.
        if string.find(request.url, '://static%.') ~= nil or
                string.find(request.url, '%.png$') ~= nil or
                string.find(request.url, '%.css') ~= nil or
                string.find(request.url, '%.js') ~= nil then
            return
        end

        request:set_header('X-Crawlera-Cookies', 'disable')
        request:set_header('X-Crawlera-Profile', 'desktop')
        request:set_header('Accept', '')
        request:set_header(session_header, session_id)
        request:set_header('X-Crawlera-Timeout', '40000')
        request:set_proxy{host, port, username=user, password=''}
    end)

    splash:on_response_headers(function (response)
        -- Reuse the Crawlera session assigned on the first response.
        if response.headers[session_header] ~= nil then
            session_id = response.headers[session_header]
        end
    end)
end

function main(splash)
    -- Disable private mode so sites that rely on local storage work.
    splash.private_mode_enabled = false

    use_crawlera(splash)
    splash:go(splash.args.url)
    splash:wait(2)

    splash:set_viewport_full()
    return {
        jpeg=splash:jpeg(),
        har=splash:har(),
        html=splash:html()
    }
end

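Before wiring the script into a spider, it can be handy to exercise it directly against Splash's HTTP API. A minimal smoke-test sketch, assuming a Splash instance reachable at localhost:8050; the target URL and Crawlera key below are placeholders:

# Standalone smoke test: POST the Lua script to Splash's /execute endpoint.
# Assumes Splash runs at localhost:8050; the URL and Crawlera key are
# placeholders to replace with your own.
import requests

with open('scripts/lua_example.lua') as f:
    lua_source = f.read()

resp = requests.post(
    'http://localhost:8050/execute',
    json={
        'lua_source': lua_source,
        'url': 'https://example.com',
        'crawlera_user': '<your Crawlera API key>',
        'timeout': 60,
    },
)
resp.raise_for_status()
print(sorted(resp.json().keys()))  # expect ['har', 'html', 'jpeg']
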
Spider:

from pkgutil import get_data
import base64

import scrapy
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header


# Adapted for the Splash example.
class Splashlua(scrapy.Spider):
    name = "splash_example"
    timeout = 60

    def __init__(self, *args, **kwargs):
        # To be able to load the Lua script on Scrapy Cloud, make sure your
        # project's setup.py contains the "package_data" setting (see the
        # setup.py sketch at the end of this paste).
        self.LUA_SOURCE = get_data(
            'hosted_splash', 'scripts/lua_example.lua'
        ).decode('utf-8')
        super(Splashlua, self).__init__(*args, **kwargs)

    def start_requests(self):
        yield SplashRequest(
            url='https://www.truepeoplesearch.com/results?streetaddress=4253%20hallmark%20rd&citystatezip=nashville%2c%20tn%2037218&rid=0x0',
            callback=self.parse_category,
            endpoint='execute',
            splash_headers={
                'Authorization': basic_auth_header(
                    self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'timeout': 60,
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
            },
            cache_args=['lua_source'],
        )

    def parse_category(self, response):
        # The Lua script returns the screenshot base64-encoded under 'jpeg'.
        jpeg = response.data['jpeg']
        with open('hosted_immig3.jpeg', 'wb') as f:
            f.write(base64.b64decode(jpeg))
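
The spider reads SPLASH_APIKEY and CRAWLERA_APIKEY from its Scrapy settings and relies on the scrapy-splash plumbing being enabled. A sketch of the settings.py this assumes; the URL and the two API-key values are placeholders, while the middleware entries are the standard scrapy-splash configuration from its README:

# settings.py (sketch): wiring assumed by the spider above.
SPLASH_URL = 'http://localhost:8050'  # or your hosted Splash instance URL
SPLASH_APIKEY = '<your Splash API key>'      # placeholder
CRAWLERA_APIKEY = '<your Crawlera API key>'  # placeholder

# Standard scrapy-splash middleware setup.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

With this in place, the spider runs as usual with `scrapy crawl splash_example`.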
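
And, as noted in __init__, the Lua script has to ship with the project for Scrapy Cloud deploys. A minimal setup.py sketch, assuming the package is named hosted_splash with the script under scripts/, matching the get_data() call above:

# setup.py (sketch): packages the Lua script so that
# pkgutil.get_data('hosted_splash', 'scripts/lua_example.lua') can find it
# on Scrapy Cloud. Package name and paths mirror the spider above.
from setuptools import setup, find_packages

setup(
    name='hosted_splash',
    version='1.0',
    packages=find_packages(),
    package_data={'hosted_splash': ['scripts/*.lua']},
    entry_points={'scrapy': ['settings = hosted_splash.settings']},
)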