SmirnovStepan

valvoline

Jul 27th, 2020
44
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.10 KB | None | 0 0
  1. import scrapy
  2. import sys
  3. import logging
  4. from scrapy_splash import SplashRequest
  5. from scrapy_splash import SplashFormRequest
  6. from scrapy.utils.response import open_in_browser
  7. from scrapy.shell import inspect_response
  8. from ..items import SdsItem
  9. from scrapy.selector import Selector
  10.  
  # Module-wide logger shared by every callback of the spider below. The name is a
  # fixed string (not __name__), so any logging configuration keyed on it keeps working.
  logger = logging.getLogger(name='crawler_logger')
  12.  
  13.  
  class FishersciSpider(scrapy.Spider):
      """Keyword search spider for the Valvoline SDS site (sds.valvoline.com).

      One request chain is started per entry in ``keywords``:

      1. ``start_requests`` loads the search page through Splash.
      2. ``parse`` submits the search form with the keyword.
      3. ``parse_form_request`` reads the result links and re-submits the form
         for the first one, imitating a click on that link.
      4. ``parse_pdf_page`` receives the resulting page (currently only a
         debugging stub).
      """

      name = 'valvoline_search'
      allowed_domains = ['sds.valvoline.com']

      # TODO collect keywords from product listing or send them instead of iterating AAA AAB
      # TODO collect names which have already been collected
      custom_settings = {
          'HTTPCACHE_ENABLED': True,
          'DOWNLOADER_MIDDLEWARES': {
              # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
              # 'scrapy_proxies.RandomProxy': 100,
              # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
              'scrapy_splash.SplashCookiesMiddleware': 723,
              'scrapy_splash.SplashMiddleware': 725,
              'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810
          },
          'LOG_LEVEL': 'DEBUG'
          # 'ITEM_PIPELINES': {
          # 'sds.pipelines.SDSValvolinePipelineCSV': 300
          # }
      }
      # Disabled: loading the keyword list from a platform-specific text file.
      # os_version = sys.platform
      # if 'win32' == os_version:
      # path_to_keywords = 'C:/Users/steph/PycharmProjects/safety_data_sheets/sds/valvoline_keywords.txt'
      # else:
      # path_to_keywords = '/home/projects/sds/sds/valvoline_keywords.txt'
      #
      # with open(path_to_keywords) as f:
      # keywords = f.readlines()
      keywords = ["Valvoline"]
      user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'

      @staticmethod
      def _search_form_fields(response, keyword):
          """Return the search-form fields shared by both form submissions.

          The view-state token is read from the element whose id is
          ``javax.faces.ViewState`` on ``response``; it is ``None`` when the
          page has no such element.
          """
          view_state = response.xpath("//*[@id='javax.faces.ViewState']/@value").get()
          return {
              'materialSearch:j_id49': 'MaterialName',
              'materialSearch:j_id55': keyword,
              'materialSearch:j_id60': '0',
              'materialSearch:j_id66': 'O',
              'materialSearch_SUBMIT': '1',
              'javax.faces.ViewState': view_state,
          }

      @staticmethod
      def _extract_link_id(onclick):
          """Return the ``materialSearch:...`` id embedded in an onclick string.

          The id is taken from the first ``materialSearch:`` occurrence up to
          (not including) the last single quote. Returns ``None`` when the
          marker or a closing quote after it is missing, instead of slicing
          from a ``-1`` index and producing a meaningless value.
          """
          start = onclick.find("materialSearch:")
          end = onclick.rfind("'")
          if start == -1 or end <= start:
              return None
          return onclick[start:end]

      def start_requests(self):
          """Yield one Splash request for the search page per keyword.

          The keyword is carried in ``meta`` so later callbacks can read it.
          ``dont_filter=True`` is needed because every keyword requests the
          same URL, which the duplicate filter would otherwise drop.
          """
          logger.info('keys %s', len(self.keywords))
          search_url = "https://sds.valvoline.com/"

          for keyword in self.keywords:
              # Keywords may be read from a file, so trim spaces and newlines.
              keyword = keyword.strip(" \n")

              logger.info('searching by key: %s', keyword)
              yield SplashRequest(
                  url=search_url,
                  callback=self.parse,
                  meta={'keyword': keyword},
                  dont_filter=True,
              )

      def parse(self, response):
          """Submit the search form on the landing page for the request's keyword."""
          keyword = response.meta['keyword']
          logger.info('extracted keyword from meta: %s', keyword)
          # Logged here (not in start_requests) because only now has the first
          # response actually arrived.
          logger.info('first response to create session done, following up with FormRequest')

          form_dict = self._search_form_fields(response, keyword)
          # The search itself is triggered by the form's submit button.
          form_dict['materialSearch:materialSearch'] = 'Submit'

          yield SplashFormRequest.from_response(
              response,
              formdata=form_dict,
              callback=self.parse_form_request,
              meta={'keyword': keyword},
          )

      def parse_form_request(self, response):
          """Follow the first result link of a search-results page.

          Logs and stops when the page shows an error message or contains no
          usable result links; otherwise re-submits the form with the link id
          in ``materialSearch:_idcl`` and hands the response to
          ``parse_pdf_page``.
          """
          keyword = response.meta['keyword']

          error_message = response.xpath("//li[@class='errorMessage']/text()").get()
          if error_message is not None:
              logger.info("empty response (no SDS found for keyword)")
              return

          # inspect_response(response, self)
          # open_in_browser(response)
          onclicks = response.xpath("//td[@class='width50Pct']/a/@onclick").getall()
          # The first element matched by the selector above is not needed.
          # Slicing (rather than pop(0)) also copes with an empty list.
          link_ids = [
              link_id
              for link_id in map(self._extract_link_id, onclicks[1:])
              if link_id
          ]
          if not link_ids:
              logger.info("no result links found for keyword: %s", keyword)
              return

          # Yield only the first request to limit the number of requests while
          # the code is not working.
          for link_id in link_ids[:1]:
              form_dict = self._search_form_fields(response, keyword)
              form_dict['materialSearch:_idcl'] = link_id

              # TODO not working properly (returns an error)
              # NOTE(review): dont_click=True stops from_response from adding the
              # submit button to the payload. The button is deliberately absent
              # from form_dict here, and sending it alongside the link id may be
              # what caused the error; confirm against the live site.
              yield SplashFormRequest.from_response(
                  response,
                  formdata=form_dict,
                  callback=self.parse_pdf_page,
                  meta={'keyword': keyword},
                  dont_click=True,
              )

      def parse_pdf_page(self, response):
          """Debugging stub: open the Scrapy shell on the response for inspection."""
          inspect_response(response, self)
  127.  
Advertisement
Add Comment
Please, Sign In to add comment