Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
- from scrapy_selenium import SeleniumRequest
- import logging
- from shutil import which
class SigmaSpider(scrapy.Spider):
    """Spider that loads sigmaaldrich.com through Selenium and picks a country.

    A single Selenium-backed request is issued for the home page; a short
    JavaScript snippet clicks one entry of the country list, and the resulting
    response is handed to :meth:`set_location`, which opens a Scrapy shell so
    the page can be examined by hand.
    """

    name = 'sigma_help'
    allowed_domains = ['sigmaaldrich.com']

    # Per-spider settings: route requests through the scrapy-selenium
    # middleware using a headless Chrome driver whose executable is looked up
    # on PATH, with the HTTP cache switched off.
    custom_settings = {
        'HTTPCACHE_ENABLED': False,
        'SELENIUM_DRIVER_NAME': 'chrome',
        # which() returns None when chromedriver is not found on PATH.
        'SELENIUM_DRIVER_EXECUTABLE_PATH': which('chromedriver'),
        'SELENIUM_DRIVER_ARGUMENTS': ['--headless'],
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_selenium.SeleniumMiddleware': 800
        },
        # 'CONCURRENT_REQUESTS': 1,
        'RETRY_TIMES': 1,
        'DOWNLOAD_DELAY': 5
    }

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"

    def start_requests(self):
        """Yield the one request that loads the home page and clicks a country.

        The response is delivered to :meth:`set_location`.
        """
        # The 17th link of the second ``.countriesRight`` column is presumed
        # to be Norway (per the original author's note) -- TODO confirm
        # against the live page, since the position may change.
        click_country_js = "document.querySelector('.countriesRight+ .countriesRight a:nth-child(17)').click()"

        # NOTE(review): no ``wait_until`` condition is supplied; confirm that
        # ``wait_time`` alone makes the driver wait before the script runs.
        location_request = SeleniumRequest(
            url="https://www.sigmaaldrich.com",
            callback=self.set_location,
            script=click_country_js,
            wait_time=10,
        )
        yield location_request

        # Follow-up request to check whether the location was set (disabled):
        # yield SeleniumRequest(url="https://www.sigmaaldrich.com", callback=self.set_location, dont_filter=True)

    def set_location(self, response):
        """Open an interactive Scrapy shell on ``response`` for inspection."""
        # Imported here, as in the original, so the shell machinery is only
        # loaded when this callback actually runs.
        from scrapy import shell

        shell.inspect_response(response, self)
Add Comment
Please, Sign In to add comment