import scrapy
import sys
import logging
from scrapy_splash import SplashRequest
from scrapy_splash import SplashFormRequest
from scrapy.utils.response import open_in_browser
from scrapy.shell import inspect_response
from ..items import SdsItem
from scrapy.selector import Selector

logger = logging.getLogger('crawler_logger')


class ValvolineSearchSpider(scrapy.Spider):
    name = 'valvoline_search'
    allowed_domains = ['sds.valvoline.com']
    # TODO collect keywords from the product listing, or send them directly,
    #  instead of iterating over letter combinations (AAA, AAB, ...)
    # TODO collect names which have already been collected
    custom_settings = {
        # NOTE: scrapy-splash also needs SPLASH_URL (and, per its README,
        # SplashDeduplicateArgsMiddleware / SplashAwareDupeFilter); those are
        # assumed to live in the project-level settings.py.
        'HTTPCACHE_ENABLED': True,
        'DOWNLOADER_MIDDLEWARES': {
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
            # 'scrapy_proxies.RandomProxy': 100,
            # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810
        },
        'LOG_LEVEL': 'DEBUG'
        # 'ITEM_PIPELINES': {
        #     'sds.pipelines.SDSValvolinePipelineCSV': 300
        # }
    }

    # os_version = sys.platform
    # if 'win32' == os_version:
    #     path_to_keywords = 'C:/Users/steph/PycharmProjects/safety_data_sheets/sds/valvoline_keywords.txt'
    # else:
    #     path_to_keywords = '/home/projects/sds/sds/valvoline_keywords.txt'
    #
    # with open(path_to_keywords) as f:
    #     keywords = f.readlines()
    keywords = ["Valvoline"]
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'

    def start_requests(self):
        # open the search page once per keyword to establish a session
        logger.info('number of keywords: {}'.format(len(self.keywords)))
        search_url = "https://sds.valvoline.com/"
        for keyword in self.keywords:
            keyword = keyword.strip(" \n")  # needed when keywords are read from a file
            logger.info('searching by key: {}'.format(keyword))
            logger.info('requesting search page to create a session; the form is submitted in parse()')
            yield SplashRequest(
                url=search_url,
                callback=self.parse,
                meta={'keyword': keyword},
                dont_filter=True
            )

    def parse(self, response):
        # submit the JSF search form with the keyword
        keyword = response.meta['keyword']
        logger.info('extracted keyword from meta: {}'.format(keyword))
        javax_faces_view_state = response.xpath("//*[@id='javax.faces.ViewState']/@value").get()
        form_dict = {
            'materialSearch:j_id49': 'MaterialName',
            'materialSearch:j_id55': keyword,
            'materialSearch:j_id60': '0',
            'materialSearch:j_id66': 'O',
            'materialSearch:materialSearch': 'Submit',
            'materialSearch_SUBMIT': '1',
            'javax.faces.ViewState': javax_faces_view_state
        }
        # print(form_dict)
        yield SplashFormRequest.from_response(
            response,
            formdata=form_dict,
            callback=self.parse_form_request,
            meta={'keyword': keyword}
        )

    def parse_form_request(self, response):
        # if no error message came back, request the detail page for each search result
        keyword = response.meta['keyword']
        error_message = response.xpath("//li[@class='errorMessage']/text()").get()
        if error_message is None:
            # inspect_response(response, self)
            # open_in_browser(response)
            form = response.xpath("//td[@class='width50Pct']/a/@onclick").getall()
            form.pop(0)  # the first element matched by the selector is not a result link
            # extract the JSF link id ('materialSearch:...') from each onclick handler
            form = [x[x.find("materialSearch:"):x.rfind("'")] for x in form]
            # yield only the first result to limit the number of requests while the code is not working
            for item in [form[0]]:
                javax_faces_view_state = response.xpath("//*[@id='javax.faces.ViewState']/@value").get()
                # .replace("\n", "/").replace("==/", "==")
                form_dict = {
                    'materialSearch:j_id49': 'MaterialName',
                    'materialSearch:j_id55': keyword,
                    'materialSearch:j_id60': '0',
                    'materialSearch:j_id66': 'O',
                    'materialSearch_SUBMIT': '1',
                    'javax.faces.ViewState': javax_faces_view_state,
                    'materialSearch:_idcl': item
                }
                # TODO not working properly (returns an error)
                yield SplashFormRequest.from_response(
                    response,
                    formdata=form_dict,
                    callback=self.parse_pdf_page,
                    meta={'keyword': keyword}
                )
        else:
            logger.info("empty response (no SDS found for keyword '{}')".format(keyword))

    def parse_pdf_page(self, response):
        inspect_response(response, self)
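
    # A possible next step for parse_pdf_page, kept commented out like the
    # other drafts in this file: a minimal sketch of yielding an SdsItem once
    # the detail-page request above stops returning an error. The item fields
    # ('name', 'url') and the PDF-link XPath are assumptions, not taken from
    # items.py or the live page.
    #
    # def parse_pdf_page(self, response):
    #     item = SdsItem()
    #     item['name'] = response.meta['keyword']  # assumed field name
    #     pdf_href = response.xpath("//a[contains(@href, '.pdf')]/@href").get()
    #     if pdf_href is not None:
    #         item['url'] = response.urljoin(pdf_href)  # assumed field name
    #         yield item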