Advertisement
Merevoli

Untitled

May 17th, 2022
37
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from datetime import datetime
  2. from scrapy.http import FormRequest, Request
  3. from pp_scraper.mixins import PostCodeMixin
  4. from pp_scraper import activity_logger
  5. from pp_scraper.proxies import LuminatiSuperProxy, ProxyMesh, USLuminatiSuperProxy
  6.  
  7. from pp_scraper.spiders.BaseScraper import BaseScraperFlinkV2, BaseScraperItemPipeline
  8.  
# Track-and-trace page of the Tuffnells site; the scraper below uses it as
# its only start URL.
TUFFPA_BASE_URL = "https://www.tuffnells.co.uk/track-and-trace"
  10.  
  11.  
  12. class TUFFPAScraper(BaseScraperItemPipeline, PostCodeMixin):
  13. name = "TUFFPAScraper"
  14.  
  15. def __init__(self, name=None, **kwargs):
  16. super(TUFFPAScraper, self).__init__(name, **kwargs)
  17. parcel_id, self.postcode = self.extract_post_code()
  18. # Source site has ip banning and tracking limit for each parcel
  19. # I don't know the limit, but the cooldown seems fast on local testing
  20. # Luminati is all banned, only ProxyMesh works, though a bit slow
  21. self.ip_proxy = ProxyMesh()
  22. # self.ip_proxy = self.strategy.ip_proxy
  23. self.start_urls = [TUFFPA_BASE_URL]
  24.  
  25. def start_requests(self):
  26. if not self.postcode:
  27. activity_logger.info(
  28. f"{self.name} {self.parcel_id} missing post_code, returning empty list"
  29. )
  30. return self.save_result([])
  31. for url in self.start_urls:
  32. yield self.produce_request(
  33. Request, url, callback=self.parse, errback=self.error_call_back
  34. )
  35.  
  36. def parse(self, response):
  37. # honeypot_time = response.css('[name=honeypot_time]').attrib.get('value')
  38. # if not honeypot_time:
  39. # return self.save_result([])
  40.  
  41. honeypot_time = "mklMG5mHUXEe1Mg1-b_TpeOusr0NitHikkor9eHIIOw"
  42.  
  43. try:
  44. return self.ip_proxy.randomize_IP(
  45. FormRequest.from_response(
  46. response,
  47. formdata={
  48. "r_urn": self.parcel_id,
  49. "r_postcode": self.postcode,
  50. "honeypot_time": honeypot_time,
  51. "form_id": "tuffnells_trackntrace_form",
  52. "op": "Find my item",
  53. },
  54. callback=self.extract_data,
  55. errback=self.error_call_back,
  56. )
  57. )
  58. except Exception:
  59. activity_logger.info(
  60. "{} {} no form found".format(self.name, self.parcel_id)
  61. )
  62. return self.queue_for_retry(should_release_lock=True)
  63.  
  64. def extract_data(self, response):
  65. raw = response.css('#edit-result')
  66.  
  67. # https://www.pivotaltracker.com/story/show/176217309/comments/220890860
  68. # Get current time as event_time
  69. event_time = datetime.now().isoformat()
  70. event_type = raw.css('p::text').get('').strip()
  71.  
  72. event_list = []
  73.  
  74. not_found = "it looks like there aren't any consignments with this ID"
  75.  
  76. if event_time and event_type and not_found not in event_type:
  77. event_list.append({'event_time': event_time, 'event_type': event_type})
  78.  
  79. if not event_list:
  80. return self.save_result([])
  81.  
  82. extra_info = dict()
  83.  
  84. parcel_pod_carrier = raw.css('img::attr(src)').get('')
  85. if parcel_pod_carrier:
  86. extra_info['parcel_pod_carrier'] = parcel_pod_carrier
  87.  
  88. # print(event_list)
  89. # print(extra_info)
  90. return self.save_result(event_list, **extra_info)
  91.  
Advertisement
RAW Paste Data Copied
Advertisement