Advertisement
Merevoli

Untitled

May 26th, 2022
533
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from contextlib import suppress
  2.  
  3. import dateutil.parser as dateparser
  4. from pp_scraper import activity_logger
  5. from pp_scraper.spiders.BaseScraper import BaseScraperItemPipeline
  6. from scrapy.http import FormRequest
  7. from scrapy_splash import SplashRequest
  8.  
  9. TFOFRE_BASE_URL = "https://www.tforcefreight.com/ltl/apps/Tracking"
  10.  
  11. DEFAULT_SCRIPT = """
  12. function main(splash, args)
  13.  assert(splash:go(args.url))
  14.  assert(splash:wait(10))
  15.  return {
  16.    html = splash:html(),
  17.    cookies = splash:get_cookies(),
  18.  }
  19. end
  20. """
  21.  
  22.  
  23. class TFOFREScraper(BaseScraperItemPipeline):
  24.     name = "TFOFREScraper"
  25.  
  26.     def __init__(self, name=None, **kwargs):
  27.         super().__init__(name, **kwargs)
  28.         self.start_urls = [TFOFRE_BASE_URL]
  29.         self.ip_proxy = self.strategy.ip_proxy
  30.  
  31.     def start_requests(self):
  32.         for url in self.start_urls:
  33.             yield self.produce_request(
  34.                 SplashRequest,
  35.                 url,
  36.                 args={
  37.                     'lua_source': DEFAULT_SCRIPT,
  38.                 },
  39.                 endpoint='execute',
  40.                 callback=self.parse,
  41.                 errback=self.error_call_back,
  42.                 cache_args=['lua_source'],
  43.             )
  44.  
  45.     def parse(self, response):
  46.         token = response.css(
  47.             "input[name=__RequestVerificationToken]::attr(value)"
  48.         ).get()
  49.  
  50.         if not token:
  51.             return self.save_result([])
  52.         print(token)
  53.  
  54.         try:
  55.             return self.ip_proxy.randomize_IP(
  56.                 FormRequest.from_response(
  57.                     response,
  58.                     formid="trackingByProForm",
  59.                     method="POST",
  60.                     formdata={
  61.                         "__RequestVerificationToken": token,
  62.                         "ProNumbers": self.parcel_id,
  63.                     },
  64.                     callback=self.track_parcel,
  65.                     errback=self.error_call_back,
  66.                 )
  67.             )
  68.             # return self.strategy.produce_request(
  69.             #     FormRequest,
  70.             #     url=TFOFRE_BASE_URL,
  71.             #     formdata={
  72.             #         "__RequestVerificationToken": token,
  73.             #         "ProNumbers": self.parcel_id,
  74.             #     },
  75.             #     callback=self.parse,
  76.             #     errback=self.error_call_back,
  77.             # )
  78.         except Exception:
  79.             activity_logger.info(f"{self.name} {self.parcel_id} no form found")
  80.             return self.save_result([])
  81.  
  82.     def track_parcel(self, response):
  83.         import pdb
  84.  
  85.         pdb.set_trace()
  86.         event_list = response.css("#deliveredDetailsRow1 tbody > tr")
  87.         if not event_list:
  88.             return self.save_result([])
  89.  
  90.         events = []
  91.         for event_item in event_list:
  92.             event = {}
  93.  
  94.             try:
  95.                 datetime = event_item.css(
  96.                     "td:nth-child(2)::text, td:nth-child(3)::text"
  97.                 ).getall()
  98.                 event_time = " ".join(datetime)
  99.                 event["event_time"] = dateparser.parse(event_time).isoformat()
  100.             except Exception:
  101.                 continue
  102.  
  103.             event_type = event_item.css("td:nth-child(4)::text").get("").strip()
  104.             if not event_type:
  105.                 continue
  106.             event["event_type"] = event_type
  107.  
  108.             event_location = event_item.css("td::text").get("").strip()
  109.             if event_location:
  110.                 event["event_location"] = event_location
  111.  
  112.             if event.get("event_time") and event.get("event_type"):
  113.                 events.append(event)
  114.  
  115.         extra_info = {}
  116.  
  117.         status_selector = "#trackingAccordion1 > div {}::text"
  118.         parcel_carrier_status = (
  119.             (
  120.                 response.css(status_selector.format("label"))
  121.                 or response.css(status_selector.format("h3"))
  122.             )
  123.             .get("")
  124.             .strip()
  125.         )
  126.         if parcel_carrier_status:
  127.             extra_info["parcel_carrier_status"] = parcel_carrier_status
  128.  
  129.         delivered_datetime = response.css(
  130.             "#deliveredOnDate::text, #deliveredAtTime::text"
  131.         ).getall()
  132.         delivered_date_carrier = " ".join(delivered_datetime)
  133.         with suppress(Exception):
  134.             extra_info["delivered_date_carrier"] = dateparser.parse(
  135.                 delivered_date_carrier
  136.             ).isoformat()
  137.  
  138.         parcel_received_by = response.css("#deliveredSignedBy::text").get("").strip()
  139.         if parcel_received_by:
  140.             extra_info["parcel_received_by"] = parcel_received_by
  141.  
  142.         parcel_product = response.css("#deliveredService::text").get("").strip()
  143.         if parcel_product:
  144.             extra_info["parcel_product"] = parcel_product
  145.  
  146.         name_selector = '#trackingAccordion1 div:contains("{}") + div label::text'
  147.         parcel_to_name_carrier = (
  148.             response.css(name_selector.format("Ship To")).get("").strip()
  149.         )
  150.         if parcel_to_name_carrier:
  151.             extra_info["parcel_to_name_carrier"] = parcel_to_name_carrier
  152.  
  153.         parcel_from_name_carrier = (
  154.             response.css(name_selector.format("Ship From")).get("").strip()
  155.         )
  156.         if parcel_from_name_carrier:
  157.             extra_info["parcel_from_name_carrier"] = parcel_from_name_carrier
  158.  
  159.         to_location = response.css("#lblOrigShipToCityState::text").get("").strip()
  160.         with suppress(Exception):
  161.             to_location_list = to_location.split()
  162.  
  163.         with suppress(Exception):
  164.             parcel_to_locality_carrier = " ".join(to_location_list[:-2])
  165.             extra_info["parcel_to_locality_carrier"] = parcel_to_locality_carrier
  166.  
  167.         with suppress(Exception):
  168.             to_country_code = to_location_list[-1]
  169.             extra_info[
  170.                 "parcel_to_country_carrier"
  171.             ] = self.get_country_name_from_country_code(to_country_code)
  172.  
  173.         from_location = response.css("#lbldestShipToCityState::text").get("").strip()
  174.         with suppress(Exception):
  175.             from_location_list = from_location.split()
  176.  
  177.         with suppress(Exception):
  178.             parcel_from_locality_carrier = " ".join(from_location_list[:-2])
  179.             extra_info["parcel_from_locality_carrier"] = parcel_from_locality_carrier
  180.  
  181.         with suppress(Exception):
  182.             from_country_code = from_location_list[-1]
  183.             extra_info[
  184.                 "parcel_from_country_carrier"
  185.             ] = self.get_country_name_from_country_code(from_country_code)
  186.  
  187.         info_selector = '#deliveredDetailsRow1 div:contains("{}") + div label::text'
  188.         pieces = response.css(info_selector.format("Pieces")).get("").strip()
  189.         if pieces and pieces.isdigit():
  190.             extra_info["parcel_piece_count_carrier"] = int(pieces)
  191.  
  192.         weight = response.css(info_selector.format("Weight")).get("").strip()
  193.         if weight:
  194.             extra_info["parcel_weight_carrier"] = weight
  195.  
  196.         # print(events)
  197.         # print(extra_info)
  198.         return self.save_result(events, **extra_info)
  199.  
Advertisement
RAW Paste Data Copied
Advertisement