Splash_Example

Lua Script:

function use_crawlera(splash)
    -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
    -- Have a look at the file spiders/quotes-js.py to see how to do it.
    -- Find your Crawlera credentials in https://app.scrapinghub.com/
    local user = splash.args.crawlera_user

    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = 'X-Crawlera-Session'
    local session_id = 'create'

    splash:on_request(function (request)
        -- The commented code below can be used to speed up crawling. It
        -- filters requests to undesired domains and useless resources.
        -- Uncomment the rules that make sense for your use case and add
        -- your own.

        -- Discard requests to advertising and tracking domains.
        -- if string.find(request.url, 'doubleclick%.net') or
        --    string.find(request.url, 'analytics%.google%.com') then
        --     request.abort()
        --     return
        -- end

        -- Avoid using Crawlera for subresource fetching to increase
        -- crawling speed. The rules below bypass Crawlera for URLs
        -- starting with 'static.' and for ones matching '.png', '.css',
        -- '/xjs' and '.js'.
        if string.find(request.url, '://static%.') ~= nil or
           string.find(request.url, '%.png$') ~= nil or
           string.find(request.url, '%.css') ~= nil or
           string.find(request.url, '/xjs') ~= nil or
           string.find(request.url, '%.js') ~= nil then
            return
        end

        request:set_header('X-Crawlera-Cookies', 'disable')
        request:set_header('X-Crawlera-Profile', 'desktop')
        request:set_header('Upgrade-Insecure-Requests', '1')
        request:set_header('Connection', 'keep-alive')
        request:set_header('DNT', '1')
        request:set_header('X-Crawlera-Timeout', '180000')
        -- Reuse the Crawlera session captured from the first response, so
        -- all requests go out through the same outgoing IP.
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=''}
    end)

    splash:on_response_headers(function (response)
        -- Note: type() never returns nil, so the original check
        -- 'type(response.headers[session_header]) ~= nil' was always true;
        -- compare the header value itself instead.
        if response.headers[session_header] ~= nil then
            session_id = response.headers[session_header]
        end
    end)
end

function main(splash)
    splash.images_enabled = false
    splash.private_mode_enabled = false

    use_crawlera(splash)
    splash:set_user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")

    splash:go(splash.args.url)
    splash:wait(10)

    splash:set_viewport_full()
    return {
        jpeg=splash:jpeg(),
        har=splash:har(),
        html=splash:html()
    }
end
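
To exercise the Lua script outside of Scrapy, you can POST it to Splash's /execute HTTP endpoint directly. Below is a minimal sketch, assuming a local Splash instance on port 8050; the target URL and the Crawlera key are placeholders, not values from this project:

import requests

SPLASH = 'http://localhost:8050/execute'      # assumed local Splash instance

with open('scripts/lua_example.lua') as f:    # same path the spider loads
    lua_source = f.read()

resp = requests.post(SPLASH, json={
    'lua_source': lua_source,
    'url': 'https://example.com/',            # placeholder target URL
    'timeout': 60,
    'crawlera_user': 'YOUR_CRAWLERA_APIKEY',  # placeholder credential
})
resp.raise_for_status()
data = resp.json()
print(len(data['html']), 'characters of rendered HTML')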

Spider:

from pkgutil import get_data
import scrapy
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header
import base64


class Splashlua(scrapy.Spider):
    name = "splash_example"
    timeout = 60

    def __init__(self, *args, **kwargs):
        # To be able to load the Lua script on Scrapy Cloud, make sure your
        # project's setup.py file contains the "package_data" setting (see
        # the setup.py sketch after this spider).
        self.LUA_SOURCE = get_data(
            'hosted_splash', 'scripts/lua_example.lua'
        ).decode('utf-8')
        super(Splashlua, self).__init__(*args, **kwargs)

    def start_requests(self):
        yield SplashRequest(
            url='https://www.harveynorman.com.sg/',
            callback=self.parse_category,
            endpoint="execute",
            splash_headers={
                'Authorization': basic_auth_header(
                    self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'timeout': 60,
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
            },
            cache_args=['lua_source'],
        )

    def parse_category(self, response):
        # The Lua script returns the screenshot base64-encoded; decode it
        # and write it to disk.
        jpeg = response.data['jpeg']
        with open('hosted_immig3.jpeg', 'wb') as f:
            f.write(base64.b64decode(jpeg))
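
The __init__ comment above refers to packaging the Lua script so that pkgutil.get_data() can find it after deployment to Scrapy Cloud. A minimal sketch of such a setup.py follows; the 'hosted_splash' package name and scripts/ path mirror the get_data() call in the spider, while the project name and version are assumptions:

from setuptools import setup, find_packages

setup(
    name='hosted-splash-example',   # assumed project name
    version='1.0',
    packages=find_packages(),
    # Ship the Lua script inside the Python package so pkgutil.get_data()
    # can load it after deployment.
    package_data={'hosted_splash': ['scripts/*.lua']},
    entry_points={'scrapy': ['settings = hosted_splash.settings']},
)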

settings.py

DOWNLOADER_MIDDLEWARES = {
    # Engine side
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # Downloader side
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

CRAWLERA_APIKEY = 'APIkey'

SPLASH_URL = 'localhost or Scrapinghub hosted Splash URL'
SPLASH_APIKEY = 'APIkey if using hosted Splash'
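
With the settings above in place, the spider can be started with `scrapy crawl splash_example`, or programmatically as in the sketch below. The import path is an assumption based on the 'hosted_splash' package name used in get_data():

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from hosted_splash.spiders.splash_example import Splashlua  # assumed module path

process = CrawlerProcess(get_project_settings())
process.crawl(Splashlua)
process.start()  # blocks until the crawl finishes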