########################################################################################################################
#    File: main.py
#  Author: Dan Huckson, https://github.com/unodan
#    Date: 2018-12-22
########################################################################################################################
# This code may need to be updated if the UNECE changes the format of the page at the link below.
# http://www.unece.org/cefact/codesfortrade/codes_index.html
########################################################################################################################

from os import listdir, remove
from csv import reader
from json import dump
from zipfile import ZipFile
from bs4 import BeautifulSoup
from urllib.request import urlretrieve, urlopen

# Fetch the UNECE code list page and read the current UN/LOCODE issue (e.g. "2018-2")
# from the download table inside the div with id "c21211".
with urlopen('http://www.unece.org/cefact/codesfortrade/codes_index.html') as response:
    html = response.read()

soup = BeautifulSoup(html, features="html.parser")
parsed_data = soup.find("div", {"id": "c21211"})

unlocode_issue = parsed_data.find_all('td')[3].text.split()[-1]
unlocode_zip_file = 'loc' + unlocode_issue.replace('-', '')[2:] + 'csv.zip'
subdivision_codes_csv_file = unlocode_issue + ' SubdivisionCodes.csv'
country_subdivisions = 'country-subdivisions.json'

zones = {}
country_zones = {}
encoding = 'ISO 8859-1'  # Character encoding used by the UNECE for the UN/LOCODE files.

# countries = {'CA': 'Canada', 'US': 'United States', 'AU': 'Australia', 'MX': 'Mexico', 'GB': 'United Kingdom'}
countries = {'CA': 'Canada'}


def get_files():
    """Return the UN/LOCODE code list CSV files found in the current directory."""
    file_list = []

    for file_name in listdir('.'):
        if 'UNLOCODE' in file_name and file_name.endswith('.csv'):
            file_list.append(file_name)

    # Sort so the code list parts are always scanned in order (Part1, Part2, ...).
    return sorted(file_list)


def get_zone_file(the_zone):
    """Return the code list CSV file that contains the given country code.

    The code list is split alphabetically across several files, so the first file
    whose last country code is >= the requested code is the one that holds it.
    """
    for file_name in get_unlocode_files():
        with open(file_name, encoding=encoding) as fn:
            lines = fn.readlines()

        last_country_code_in_file = lines[-1].split(',')[1].strip('"')

        if the_zone <= last_country_code_in_file:
            return file_name


def get_unlocode_files():
    """Return the local UN/LOCODE CSV files, downloading and extracting them first if needed."""
    unlocode_files = get_files()

    if not unlocode_files:
        # Site below is where the UN/LOCODE CSV files are located.
        # http://www.unece.org/cefact/codesfortrade/codes_index.html
        url = 'http://www.unece.org/fileadmin/DAM/cefact/locode/' + unlocode_zip_file
        src_file_name = url.split('/')[-1]
        urlretrieve(url, src_file_name)

        with ZipFile(src_file_name, 'r') as z:
            z.extractall('.')

        unlocode_files = get_files()
        remove(src_file_name)

    return unlocode_files


# Make sure the UN/LOCODE CSV files (including the subdivision codes file) are present locally.
get_unlocode_files()


# Build a lookup of subdivision (state/province) codes for the selected countries.
# Each row of the subdivision codes file is: country code, subdivision code, name, type.
with open(subdivision_codes_csv_file, encoding=encoding) as f:
    results = reader(f, delimiter=',', quotechar='"')

    for row in results:
        country_code, zone_code, zone_name, zone_type = row
        if zone_type == 'Outlying area':
            continue

        if country_code in countries:
            country_zones[zone_code] = {
                'zone': country_code,
                'code': zone_code,
                'name': zone_name,
                'type': zone_type,
            }

# Walk the UN/LOCODE code list and group its locations by country and subdivision.
for country_code, country_name in countries.items():

    if country_code not in zones:
        zones[country_code] = {'code': country_code, 'name': country_name, 'zones': {}}

    file = get_zone_file(country_code)
    with open(file, encoding=encoding) as f:
        results = reader(f, delimiter=',', quotechar='"')

        for row in results:
            cntry_code = row[1]             # country code
            zone_code = row[5].strip()      # subdivision code
            location_name = row[3]          # location name
            location_code = row[2]          # location code
            location_flags = row[6]         # function classifiers (e.g. 4 = airport)
            location_coordinates = row[10]  # geographical coordinates

            if cntry_code == country_code:
                # Skip airport entries (function classifier 4, name containing " Apt").
                if '4' in location_flags and ' Apt' in location_name:
                    continue

                # Skip locations without a known subdivision and entries whose name starts with a period.
                if zone_code not in country_zones or location_name.startswith('.'):
                    continue

                country = zones.get(cntry_code)

                # Create the subdivision entry the first time one of its locations is seen.
                if zone_code not in country['zones']:
                    country['zones'][zone_code] = {
                        'country': cntry_code,
                        'code': country_zones[zone_code].get('code'),
                        'name': country_zones[zone_code].get('name'),
                        'type': country_zones[zone_code].get('type'),
                        'locations': {},
                    }
                zone = country['zones'][zone_code]

                if location_code not in zone['locations']:
                    zone['locations'][location_code] = {
                        'code': location_code,
                        'name': location_name,
                        'flags': location_flags,
                        'coordinates': location_coordinates
                    }

with open(country_subdivisions, 'w', encoding=encoding) as f:
    dump(zones, f, indent=2)
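
For reference, here is a minimal sketch of how the generated country-subdivisions.json file can be loaded and traversed. The file name and structure follow the script above; the 'CA' and 'ON' keys are only illustrative and assume Canada/Ontario ended up in the output.

from json import load

# Load the file written by the script above; the structure mirrors how the `zones`
# dictionary is built: country -> zones -> locations.
with open('country-subdivisions.json', encoding='ISO 8859-1') as f:
    zones = load(f)

# 'CA' / 'ON' are example keys; any country and subdivision present in the output works.
ontario = zones['CA']['zones'].get('ON', {})
print(ontario.get('name'), ontario.get('type'))

for code, location in ontario.get('locations', {}).items():
    print(code, location['name'], location['coordinates'])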