# -*- coding: utf-8 -*-
import re

import dateutil.parser as dateparser
import jmespath

from pp_scraper.spiders.BaseScraper import BaseScraperItemPipeline
from pp_scraper.mixins import RedisMixin

GLSITA_BASE_URL = 'https://gls-group.com/app/service/open/rest/IT/it/rstt001?match=%s&type=%s'

class GLSITAScraper(BaseScraperItemPipeline, RedisMixin):
    name = "GLSITAScraper"

    def __init__(self, name=None, **kwargs):
        super(GLSITAScraper, self).__init__(name, **kwargs)
        customer_code = self.params.get('scraping_parameters').get('customer_code')
        office_code = self.params.get('scraping_parameters').get('office_code')
        # Parcel ids that look like national tracking numbers use the NAT
        # lookup; every other case currently falls back to DDT.
        if re.match(r'^\w[\w\d]\d{,9}$', self.parcel_id):
            mode = 'NAT'
        elif customer_code and office_code:
            mode = 'DDT'
        else:
            mode = 'DDT'
        # GLSITA_BASE_URL uses %s placeholders, so substitute with the
        # %-operator rather than str.format().
        self.start_urls = [GLSITA_BASE_URL % (self.parcel_id, mode)]
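        # Illustrative only (assumed parcel id, not real data): an id such as
        # "AB123456789" matches the NAT pattern above, so the spider would
        # start from
        # https://gls-group.com/app/service/open/rest/IT/it/rstt001?match=AB123456789&type=NAT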

    def parse(self, response):
        data = jmespath.search('tuStatus[0]', response.json())
        event_data = data.get('history')
        if not event_data:
            return self.save_result([])

        # Events
        events = []
        for raw_event in event_data:
            event = dict()
            event_type = raw_event.get('evtDscr')
            comment = raw_event.get('note')
            raw_date = raw_event.get('date')
            raw_time = raw_event.get('time')
            address = jmespath.search('address.city', raw_event)

            event_time = None
            if raw_date and raw_time:
                try:
                    event_time = dateparser.parse(
                        " ".join([raw_date, raw_time])
                    ).isoformat()
                except Exception:
                    continue

            if address:
                event['location'] = address
            if comment:
                event['comments'] = comment
            if event_type and event_time:
                event['event_type'] = event_type
                event['event_time'] = event_time
                events.append(event)
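        # Each assembled event has roughly this shape (placeholder values,
        # not real carrier data):
        #   {"event_type": <evtDscr>, "event_time": <ISO 8601 string>,
        #    "location": <address.city>, "comments": <note>}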

        # Additional information
        additional_info = dict()
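        # The XPath lookups below assume the tracking page renders a summary
        # table of label/value rows roughly like this (illustrative markup,
        # not copied from the live site):
        #   <tr><td><b>Departure date:</b></td><td><span>05.01.2023</span></td></tr>
        #   <tr><td><b>Weight:</b></td><td><span>1,5</span></td></tr>
        # i.e. a bold label cell followed by a sibling cell holding the value.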
        parcel_ship_time_carrier = (
            response.xpath(
                '//td[b[normalize-space(text())="Departure date:"]]'
                '/following-sibling::td[1]/span'
                '/text()'
            )
            .extract_first(default='')
            .strip()
        )
        if parcel_ship_time_carrier:
            try:
                parsed_time = dateparser.parse(
                    parcel_ship_time_carrier, dayfirst=True
                ).strftime('%Y-%m-%d')
                additional_info['parcel_ship_time_carrier'] = parsed_time
            except Exception:
                pass

        parcel_piece_count_carrier = (
            response.xpath(
                '//td[b[normalize-space(text())="Parcel:"]]'
                '/following-sibling::td[1]/span'
                '/text()'
            )
            .extract_first(default='')
            .strip()
        )
        if parcel_piece_count_carrier:
            additional_info['parcel_piece_count_carrier'] = parcel_piece_count_carrier

        parcel_weight_carrier = (
            response.xpath(
                '//td[b[normalize-space(text())="Weight:"]]'
                '/following-sibling::td[1]/span'
                '/text()'
            )
            .extract_first(default='')
            .strip()
        )
        if parcel_weight_carrier:
            additional_info['parcel_weight_carrier'] = parcel_weight_carrier.replace(
                ',', '.'
            )

        parcel_volume_carrier = (
            response.xpath(
                '//td[b[normalize-space(text())="Volume:"]]/following-sibling::td[1]'
                '/span/text()'
            )
            .extract_first(default='')
            .strip()
        )
        if parcel_volume_carrier:
            additional_info['parcel_volume_carrier'] = parcel_volume_carrier.replace(
                ',', '.'
            )

        parcel_from_locality_carrier = (
            response.xpath(
                '//td[b[normalize-space(text())="Sender:"]]'
                '/following-sibling::td[1]/span'
                '/text()'
            )
            .extract_first(default='')
            .strip()
        )
        if parcel_from_locality_carrier:
            additional_info['parcel_from_locality_carrier'] = re.sub(
                r'\s+', ' ', parcel_from_locality_carrier
            )

        parcel_to_locality_carrier = (
            response.xpath(
                '//td[b[normalize-space(text())="Recipient:"]]'
                '/following-sibling::td[1]/span'
                '/text()'
            )
            .extract_first(default='')
            .strip()
        )
        if parcel_to_locality_carrier:
            additional_info['parcel_to_locality_carrier'] = re.sub(
                r'\s+', ' ', parcel_to_locality_carrier
            )

        parcel_origin_office_carrier = (
            response.xpath(
                '//td[b[normalize-space(text())="Depot sender:"]]'
                '/following-sibling::td[1]/span'
                '/text()'
            )
            .extract_first(default='')
            .strip()
        )
        if parcel_origin_office_carrier:
            additional_info['parcel_origin_office_carrier'] = re.sub(
                r'\s+', ' ', parcel_origin_office_carrier
            )

        parcel_destination_office_carrier = (
            response.xpath(
                '//td[b[normalize-space(text())="Depot recipient:"]]'
                '/following-sibling::td[1]/span'
                '/text()'
            )
            .extract_first(default='')
            .strip()
        )
        if parcel_destination_office_carrier:
            additional_info['parcel_destination_office_carrier'] = re.sub(
                r'\s+', ' ', parcel_destination_office_carrier
            )

        parcel_carrier_customer_reference = re.sub(
            r'\s+',
            '',
            response.xpath(
                '//td[b[normalize-space(text())="Shipment number:"]]'
                '/following-sibling::td[1]/span'
                '/text()'
            ).extract_first(default=''),
        )
        if (
            parcel_carrier_customer_reference
            and self.parcel_id != parcel_carrier_customer_reference
        ):
            additional_info[
                'parcel_carrier_customer_reference'
            ] = parcel_carrier_customer_reference

        parcel_id2 = ''.join(
            response.xpath('//td[@id="inter"]/text()').extract()
        ).strip()
        if parcel_id2 and self.parcel_id != parcel_id2:
            additional_info['parcel_id2'] = [parcel_id2]

        # from pp_scraper.utils import print_pretty_json
        # print_pretty_json(events)
        # print_pretty_json(additional_info)
        return self.save_result(events, **additional_info)