Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import platform
- import sys
- import numpy as np
- import json
- import re
- from pprint import pprint
- from datetime import timedelta, date
# Folder separator used when building relative paths: backslash on Windows,
# forward slash everywhere else.
fs = "\\" if platform.system() == "Windows" else "/"

# Python 2 only: force the interpreter-wide default encoding to UTF-8.
# (`reload` is a Python 2 builtin; this branch is never taken on Python 3.)
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Years whose daily JSON files are converted.
years = [2014, 2015, 2016]
# Whether dates without a JSON file are skipped (0: No, 1: Yes).
skip_missing_dates = 1

# Accumulators filled by the conversion loop: one numeric row per departure,
# and the destination names in order of first appearance.
DW_npData = []
destinations = []

# Output file locations (np.save appends the ".npy" extension).
loc_DW_npData = fs.join(("..", "Data", "DelayData_WeatherData", "DW_npData_training"))
loc_destinations = fs.join(("..", "Data", "Destinations_training"))
- # Save delay data from the JSON files to NumPy arrays
def daterange(start_date, end_date):
    """Yield every date from *start_date* through *end_date*, both inclusive.

    Dates are produced one day apart in ascending order. Nothing is yielded
    when *end_date* lies before *start_date*.
    """
    span_days = (end_date - start_date).days
    offset = 0
    while offset <= span_days:
        yield start_date + timedelta(days=offset)
        offset += 1
# Convert destinations to numbers
def convertDestination(destination):
    """Return the numeric code of *destination*.

    The code is the position of the name in the module-level ``destinations``
    list. A name that has not been seen before is appended to that list first,
    so it receives the next free index.
    """
    try:
        return destinations.index(destination)
    except ValueError:
        # First occurrence: register it at the end of the list.
        destinations.append(destination)
        return len(destinations) - 1
# First run of digits in a scraped value (raw string avoids the invalid "\d" escape).
_DIGITS = re.compile(r"\d+")


def _first_int(value):
    """Return the first run of digits in *value* as an int, or 0 if there is none.

    *value* is stringified first, so ``None`` (which becomes "None") and any
    other digit-free value map to 0 instead of raising IndexError.
    """
    found = _DIGITS.search(str(value))
    return int(found.group()) if found else 0


# Counters reported at the end of the run.
total_files = 0
missing_files = 0
wrong_files = 0
skipped_rows = 0

for year in years:
    print("Converting " + str(year) + " to numpy arrays:")
    for YMD in daterange(date(year, 1, 1), date(year, 12, 31)):
        d = YMD.strftime("%Y%m%d")
        total_files += 1

        # File location; month and day appear without zero padding in the name.
        json_path = (".." + fs + "Webscraper" + fs + str(year) + fs
                     + "departmentinfo_UT_" + str(year) + "_" + str(YMD.month)
                     + "_" + str(YMD.day) + ".json")
        try:
            # The context manager closes the handle even if parsing fails.
            with open(json_path) as handle:
                Data = json.load(handle)
        except FileNotFoundError:
            if skip_missing_dates != 1:
                raise
            missing_files += 1
            print(" " + json_path)
            print(" File does not exist, so skipping it...")
            continue

        try:
            rows = Data["department_info"][d]
        except KeyError:
            wrong_files += 1
            print(" " + json_path)
            print(" File does exist, but does not contain useful information, so skipping it")
            continue

        # Month and day are the same for every row of this file.
        month = YMD.month
        day = YMD.day
        # Time of the last row parsed from THIS file; None until one is parsed,
        # so the skip message never reads an undefined or stale value.
        hour = minute = None

        # NOTE(review): rows are looked up by the string keys "1" .. str(len - 1);
        # whether that covers every row depends on the JSON layout - confirm.
        for row in range(1, len(rows)):
            try:
                info = rows[str(row)]["department_nextarrival"]
                time_text = info["time"]
            except KeyError:
                skipped_rows += 1
                if hour is None:
                    previous = "(no earlier row in this file)"
                else:
                    previous = str(hour) + ":" + str(minute)
                print(" " + json_path)
                print(" Skipped the row after the following time because it doesn't exists in the json file: " + previous)
                continue

            hour, minute = str(time_text).split(".")

            # Extra delay picked up on the way: arrival minus departure, never negative.
            depart_delay = _first_int(info["current_depart_delay"])
            arrival_delay = _first_int(info["next_arrival_delay"])
            delay = max(0, arrival_delay - depart_delay)

            destination = convertDestination(str(info["final_destination"]))
            track = _first_int(info["track"])

            DW_npData.append(np.array([
                month,
                day,
                int(hour),
                int(minute),
                delay,
                int(destination),
                track,
                int(str(info["wind_speed"])),
                int(str(info["temperature"])),
                int(str(info["snow"])),
            ]))
# Print a preview of the converted data. Slices are used so the preview works
# with fewer than 10 rows and the "last" preview really ends at the final row.
print("\nFirst 10 rows in DW_npData:")
print(" mon day hr min del des tr w tem snow")
for sample in DW_npData[:10]:
    print(" " + str(sample))

print("\nLast 10 rows in DW_npData:")
print(" mon day hr min del des tr w tem snow")
for sample in DW_npData[-10:]:
    print(" " + str(sample))

print("\nNumber of rows in DW_npData:")
print(" " + str(len(DW_npData)))

# Persist the converted rows and the destination lookup list.
print("\nSaving DW_npData to file:")
print(" " + loc_DW_npData)
np.save(loc_DW_npData, DW_npData)

print("\nSaving destinations to file:")
print(" " + loc_destinations)
np.save(loc_destinations, destinations)

# Summary of what was skipped during the conversion.
print("\nFrom the total of " + str(total_files) + " files:")
print(" " + str(missing_files) + " missing files")
print(" " + str(wrong_files) + " wrong files")
print(" " + str(skipped_rows) + " skipped rows\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement