import scrapy
import sys
import logging

from scrapy_splash import SplashRequest
from scrapy_splash import SplashFormRequest
from scrapy.utils.response import open_in_browser
from scrapy.shell import inspect_response
from ..items import SdsItem
from scrapy.selector import Selector

logger = logging.getLogger('crawler_logger')
class FishersciSpider(scrapy.Spider):
    name = 'valvoline_search'
    allowed_domains = ['sds.valvoline.com']

    # TODO collect keywords from product listing or send them instead of iterating AAA AAB
    # TODO collect names which have already been collected

    custom_settings = {
        'HTTPCACHE_ENABLED': True,
        'DOWNLOADER_MIDDLEWARES': {
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
            # 'scrapy_proxies.RandomProxy': 100,
            # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'LOG_LEVEL': 'DEBUG',
        # 'ITEM_PIPELINES': {
        #     'sds.pipelines.SDSValvolinePipelineCSV': 300
        # }
    }
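
    # Note: the scrapy-splash README also calls for a Splash endpoint plus its
    # Splash-aware dupefilter and cache-storage classes; with HTTPCACHE_ENABLED
    # above, the latter matters. A sketch, assuming a local Splash instance on
    # the default port 8050:
    # custom_settings.update({
    #     'SPLASH_URL': 'http://localhost:8050',
    #     'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
    #     'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
    # })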

    # os_version = sys.platform
    # if 'win32' == os_version:
    #     path_to_keywords = 'C:/Users/steph/PycharmProjects/safety_data_sheets/sds/valvoline_keywords.txt'
    # else:
    #     path_to_keywords = '/home/projects/sds/sds/valvoline_keywords.txt'
    #
    # with open(path_to_keywords) as f:
    #     keywords = f.readlines()

    keywords = ["Valvoline"]
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
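
    # Usage (assumption: a Splash instance is running separately, e.g. via the
    # official Docker image, for the SplashRequests below to render):
    #   scrapy crawl valvoline_search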

    def start_requests(self):
        # open the search page once per keyword to establish a session
        logger.info('keys {}'.format(len(self.keywords)))
        search_url = "https://sds.valvoline.com/"
        for keyword in self.keywords:
            # strip trailing newlines in case the keywords were read from a file
            keyword = keyword.strip(" \n")
            logger.info('searching by key: {}'.format(keyword))
            logger.info('yielding session request; the search FormRequest follows in parse')
            yield SplashRequest(
                url=search_url,
                callback=self.parse,
                meta={'keyword': keyword},
                dont_filter=True,
            )

    def parse(self, response):
        # submit the search form with the keyword
        keyword = response.meta['keyword']
        logger.info('extracted keyword from meta: {}'.format(keyword))
        # JSF pages require the server-issued ViewState token to be echoed
        # back with every form postback
        javax_faces_view_state = response.xpath("//*[@id='javax.faces.ViewState']/@value").get()
        form_dict = {
            'materialSearch:j_id49': 'MaterialName',  # search-by selector
            'materialSearch:j_id55': keyword,         # search term
            'materialSearch:j_id60': '0',
            'materialSearch:j_id66': 'O',
            'materialSearch:materialSearch': 'Submit',
            'materialSearch_SUBMIT': '1',
            'javax.faces.ViewState': javax_faces_view_state,
        }
        # print(form_dict)
        yield SplashFormRequest.from_response(
            response,
            formdata=form_dict,
            callback=self.parse_form_request,
            meta={'keyword': keyword},
        )
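
        # If the results page renders server-side (an assumption; classic JSF
        # postbacks usually do), a plain FormRequest is a drop-in fallback
        # that skips Splash entirely:
        #
        # yield scrapy.FormRequest.from_response(
        #     response,
        #     formdata=form_dict,
        #     callback=self.parse_form_request,
        #     meta={'keyword': keyword},
        # )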

    def parse_form_request(self, response):
        # if there is no error message, request the detail page for each result
        keyword = response.meta['keyword']
        error_message = response.xpath("//li[@class='errorMessage']/text()").get()
        if error_message is None:
            # inspect_response(response, self)
            # open_in_browser(response)
            form = response.xpath("//td[@class='width50Pct']/a/@onclick").getall()
            form.pop(0)  # the first element matched by the selector above is not needed
            # keep only the JSF component id between "materialSearch:" and the closing quote
            form = [x[x.find("materialSearch:"):x.rfind("'")] for x in form]
            # yield only the first result to limit the number of requests while the code is not working
            for item in [form[0]]:
                javax_faces_view_state = response.xpath("//*[@id='javax.faces.ViewState']/@value").get()
                # .replace("\n", "/").replace("==/", "==")
                form_dict = {
                    'materialSearch:j_id49': 'MaterialName',
                    'materialSearch:j_id55': keyword,
                    'materialSearch:j_id60': '0',
                    'materialSearch:j_id66': 'O',
                    'materialSearch_SUBMIT': '1',
                    'javax.faces.ViewState': javax_faces_view_state,
                    'materialSearch:_idcl': item,
                }
                # TODO not working properly (returns an error); possibly a stale
                # ViewState or missing session state on the postback
                yield SplashFormRequest.from_response(
                    response,
                    formdata=form_dict,
                    callback=self.parse_pdf_page,
                    meta={'keyword': keyword},
                )
        else:
            logger.info("empty response (no SDS found for keyword)")

    def parse_pdf_page(self, response):
        # debugging stub: drop into a scrapy shell to examine the rendered page
        inspect_response(response, self)
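
        # A minimal sketch of the intended follow-up once the postback above
        # works: pull the SDS PDF link off the detail page and emit an item.
        # The XPath and the SdsItem field names ('name', 'pdf_url') are
        # assumptions, not taken from the real page or from items.py:
        #
        # item = SdsItem()
        # item['name'] = response.meta['keyword']
        # item['pdf_url'] = response.urljoin(
        #     response.xpath("//a[contains(@href, '.pdf')]/@href").get())
        # yield item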