Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ########################################################################################################################
- # File: main.py
- # Author: Dan Huckson, https://github.com/unodan
- # Date: 2018-12-22
- ########################################################################################################################
- # This code may need to be updated if the UNECE changes their format of the page in the link below.
- # http://www.unece.org/cefact/codesfortrade/codes_index.html
- ########################################################################################################################
- from os import listdir, remove
- from csv import reader
- from json import dump
- from zipfile import ZipFile
- from bs4 import BeautifulSoup
- from urllib.request import urlretrieve, urlopen
# Fetch the UNECE code-list index page; the names of the current data files
# are derived from text scraped out of it below.
with urlopen('http://www.unece.org/cefact/codesfortrade/codes_index.html') as response:
    html = response.read()

soup = BeautifulSoup(html, features="html.parser")
parsed_data = soup.find("div", {"id": "c21211"})

# Last whitespace-separated token of the fourth <td> inside that div.
# NOTE(review): presumably the release tag (something like "2018-2") -- confirm
# against the live page; this breaks if UNECE changes the page layout.
_release_tag = parsed_data.find_all(['td'])[3].text.split()[-1]

# Zip name: the tag with dashes removed and its first two characters dropped,
# wrapped in 'loc' ... 'csv.zip'.
_compact_tag = _release_tag.replace('-', '')[2:]
unlocode_zip_file = 'loc' + _compact_tag + 'csv.zip'
subdivision_codes_csv_file = _release_tag + ' SubdivisionCodes.csv'

# Name of the JSON file this script writes.
country_subdivisions = 'country-subdivisions.json'

# Working state filled in by the rest of the script.
zones = {}
cities = []
state_codes = []
country_zones = {}

encoding = 'ISO 8859-1'  # Character encoding used by the UNECE for the UN/LOCODE files.

# Countries to export, keyed by two-letter country code.  A wider selection:
# countries = {'CA': 'Canada', 'US': 'United States', 'AU': 'Australia', 'MX': 'Mexico', 'GB': 'United Kingdom'}
countries = {'CA': 'Canada'}
def get_files():
    """Return the UN/LOCODE CSV file names found in the current directory.

    A file qualifies when its name contains ``UNLOCODE`` and ends with
    ``.csv``.  The result is sorted because ``os.listdir`` returns entries in
    arbitrary order, while ``get_zone_file`` walks the files in sequence and
    returns the first one whose last country code is not below the requested
    one -- which only picks the right part when the parts are visited in
    ascending order.

    Returns:
        list: Matching file names (not full paths), sorted ascending.
    """
    return sorted(
        name for name in listdir('.')
        if 'UNLOCODE' in name and name.endswith('.csv')
    )
def get_zone_file(the_zone):
    """Return the name of the UN/LOCODE CSV file expected to contain *the_zone*.

    Each available file is checked for the country code in the second
    comma-separated field of its last non-blank line.  Files are visited in
    sorted name order and the first one whose last country code is greater
    than or equal to *the_zone* is returned.

    Args:
        the_zone: Country code to look for; compared as a plain string.

    Returns:
        The matching file name, or ``None`` when no file qualifies.
    """
    # Sort explicitly: the "first file whose last code >= the_zone" rule only
    # selects the right part when the parts are visited in ascending order.
    for file_name in sorted(get_unlocode_files()):
        last_line = ''
        with open(file_name, encoding=encoding) as fn:
            # Stream the file instead of materialising every line just to
            # reach the last one; ignore trailing blank lines.
            for line in fn:
                if line.strip():
                    last_line = line

        fields = last_line.split(',')
        if len(fields) < 2:
            # Empty or malformed file: there is no country code to compare.
            continue

        last_country_code_in_file = fields[1].strip('"')
        if the_zone <= last_country_code_in_file:
            return file_name

    return None
def get_unlocode_files():
    """Return the local UN/LOCODE CSV file names, downloading them if absent.

    When ``get_files()`` finds nothing in the current directory, the zip
    archive named by ``unlocode_zip_file`` is downloaded from the UNECE site,
    extracted into the current directory and then deleted.

    Returns:
        list: The file names reported by ``get_files()`` (after the download,
        if one was needed).
    """
    unlocode_files = get_files()
    if unlocode_files:
        return unlocode_files

    # Site below is where the UN/LOCODE CSV files are located.
    # http://www.unece.org/cefact/codesfortrade/codes_index.html
    url = 'http://www.unece.org/fileadmin/DAM/cefact/locode/' + unlocode_zip_file
    src_file_name = url.split('/')[-1]

    urlretrieve(url, src_file_name)
    try:
        with ZipFile(src_file_name, 'r') as z:
            z.extractall('.')
    finally:
        # Remove the downloaded archive even when extraction fails, so a
        # corrupt download is not left behind in the working directory.
        remove(src_file_name)

    return get_files()
# Make sure the UN/LOCODE CSV files exist locally (downloads them when missing).
get_unlocode_files()

# Subdivision records keyed by (country code, subdivision code).  The same
# subdivision code can occur under more than one country, so a lookup keyed on
# the subdivision code alone would let one selected country's record replace
# another's and attach locations to the wrong subdivision.
_zones_by_country = {}

# Pass 1: collect the subdivisions of the selected countries.
with open(subdivision_codes_csv_file, encoding=encoding) as f:
    results = reader(f, delimiter=',', quotechar='"')
    for row in results:
        if not row:
            continue  # csv.reader yields an empty list for a blank line
        country_code, zone_code, zone_name, zone_type = row
        if zone_type == 'Outlying area':
            continue
        if country_code in countries:
            zone_info = {
                'zone': country_code,
                'code': zone_code,
                'name': zone_name,
                'type': zone_type,
            }
            # country_zones keeps its original shape (keyed by subdivision
            # code only); the per-country lookup is what pass 2 relies on.
            country_zones[zone_code] = zone_info
            _zones_by_country[(country_code, zone_code)] = zone_info

# Pass 2: attach every location of each selected country to its subdivision.
for country_code, country_name in countries.items():
    if country_code not in zones:
        zones[country_code] = {'code': country_code, 'name': country_name, 'zones': {}}
    country = zones[country_code]

    file = get_zone_file(country_code)
    if file is None:
        # Fail with a clear message instead of the TypeError open(None) raises.
        raise LookupError('No UN/LOCODE file found for country code %r' % country_code)

    with open(file, encoding=encoding) as f:
        results = reader(f, delimiter=',', quotechar='"')
        for row in results:
            if len(row) < 11:
                continue  # blank or truncated row: the columns read below are missing

            cntry_code = row[1]
            if cntry_code != country_code:
                continue

            zone_code = row[5].strip()
            location_name = row[3]
            location_code = row[2]
            location_flags = row[6]
            location_coordinates = row[10]

            # NOTE(review): presumably filters out airport entries (flag '4'
            # together with ' Apt' in the name) -- confirm against the
            # UN/LOCODE function codes.
            if '4' in location_flags and ' Apt' in location_name:
                continue
            # Names starting with '.' are skipped (presumably the per-country
            # header rows -- TODO confirm).
            if location_name.startswith('.'):
                continue

            zone_info = _zones_by_country.get((cntry_code, zone_code))
            if zone_info is None:
                continue  # subdivision unknown for this country

            zone = country['zones'].get(zone_code)
            if zone is None:
                # First location seen for this subdivision: create its record.
                zone = {
                    'country': cntry_code,
                    'code': zone_info.get('code'),
                    'name': zone_info.get('name'),
                    'type': zone_info.get('type'),
                    'locations': {},
                }
                country['zones'][zone_code] = zone

            # Keep the first record seen for a given location code.
            if location_code not in zone['locations']:
                zone['locations'][location_code] = {
                    'code': location_code,
                    'name': location_name,
                    'flags': location_flags,
                    'coordinates': location_coordinates
                }

# Write the collected country -> subdivision -> location tree as JSON.
with open(country_subdivisions, 'w', encoding=encoding) as f:
    dump(zones, f, indent=2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement