Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- Lua Script:
-- Route Splash requests through the Crawlera proxy.
-- Pass your Crawlera API key in the 'crawlera_user' arg (see
-- spiders/quotes-js.py for how). Find your Crawlera credentials at
-- https://app.scrapinghub.com/
function use_crawlera(splash)
  local user = splash.args.crawlera_user
  local host = 'proxy.crawlera.com'
  local port = 8010
  local session_header = 'X-Crawlera-Session'
  local session_id = 'create'

  splash:on_request(function (request)
    -- The commented code below can be used to speed up crawling by
    -- filtering requests to undesired domains and useless resources.
    -- Uncomment the ones that make sense for your use case and add
    -- your own rules.
    -- Discard requests to advertising and tracking domains:
    -- if string.find(request.url, 'doubleclick%.net') or
    --    string.find(request.url, 'analytics%.google%.com') then
    --   request.abort()
    --   return
    -- end
    -- Avoid using Crawlera for subresource fetching to increase
    -- crawling speed: skip URLs for static assets (note these are Lua
    -- patterns, so '%.' is a literal dot).
    if string.find(request.url, '://static%.') ~= nil or
       string.find(request.url, '%.png$') ~= nil or
       string.find(request.url, '%.css') ~= nil or
       string.find(request.url, '%/xjs') ~= nil or
       string.find(request.url, '%.js') ~= nil then
      return
    end
    request:set_header('X-Crawlera-Cookies', 'disable')
    request:set_header('X-Crawlera-profile', 'desktop')
    request:set_header('upgrade-insecure-requests', '1')
    request:set_header('Connection', 'keep-alive')
    request:set_header('DNT', '1')
    request:set_header('X-Crawlera-Timeout', '180000')
    request:set_proxy{host, port, username=user, password=''}
  end)

  splash:on_response_headers(function (response)
    -- BUG FIX: the original checked `type(...) ~= nil`, which is always
    -- true because type() always returns a string. Compare the header
    -- value itself so session_id is only updated when the header exists.
    if response.headers[session_header] ~= nil then
      session_id = response.headers[session_header]
    end
  end)
end
-- Splash entry point: fetch splash.args.url through Crawlera and return
-- a JPEG screenshot, the HAR log and the rendered HTML.
function main(splash)
  local desktop_chrome_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"

  splash.images_enabled = false        -- do not download images
  splash.private_mode_enabled = false  -- some sites misbehave in private mode
  use_crawlera(splash)
  splash:set_user_agent(desktop_chrome_ua)

  splash:go(splash.args.url)
  splash:wait(10)                      -- give the page time to render
  splash:set_viewport_full()

  local result = {}
  result.jpeg = splash:jpeg()
  result.har = splash:har()
  result.html = splash:html()
  return result
end
- Spider:
- from pkgutil import get_data
- import scrapy
- from scrapy_splash import SplashRequest
- from w3lib.http import basic_auth_header
- import base64
class Splashlua(scrapy.Spider):
    """Render a page through a hosted Splash instance (proxied via
    Crawlera) and save the returned JPEG screenshot to disk."""

    name = "splash_example"
    timeout = 60  # seconds; mirrored in the Splash 'timeout' arg below

    def __init__(self, *args, **kwargs):
        # To be able to load the Lua script on Scrapy Cloud, make sure your
        # project's setup.py file contains the "package_data" setting,
        # similar to this project's setup.py.
        self.LUA_SOURCE = get_data(
            'hosted_splash', 'scripts/lua_example.lua'
        ).decode('utf-8')
        super(Splashlua, self).__init__(*args, **kwargs)

    def start_requests(self):
        # BUG FIX: the original URL read 'ttps://...' (missing the leading
        # 'h'), which is not a valid scheme and the request would fail.
        yield SplashRequest(
            url='https://www.harveynorman.com.sg/',
            callback=self.parse_category,
            endpoint="execute",
            splash_headers={
                'Authorization': basic_auth_header(
                    self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'timeout': '60',
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
            },
            cache_args=['lua_source'],
        )

    def parse_category(self, response):
        # Decode the base64-encoded screenshot produced by the Lua script
        # and write it to disk. (Removed a stray no-op 'pass' statement.)
        jpeg = response.data['jpeg']
        with open('hosted_immig3.jpeg', 'wb') as f:
            f.write(base64.b64decode(jpeg))
- settings.py
# Scrapy + scrapy-splash project configuration (settings.py).
# Middleware ordering follows the scrapy-splash README.
DOWNLOADER_MIDDLEWARES = {
    # Engine side
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    # Re-enable compression after SplashMiddleware (order > 725).
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # Disabled: the Lua script sets the user agent itself.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # Downloader side
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Splash-aware dedup/cache classes so request fingerprints account for
# the Splash arguments, not just the wrapped URL.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
# Placeholder values — replace with real credentials/endpoints before running.
CRAWLERA_APIKEY = 'APIkey'
SPLASH_URL = 'localhost or Scrapinghub hosted splash URL'
SPLASH_APIKEY = 'APIkey if using hosted Splash'
Add Comment
Please sign in to add a comment.