Advertisement
Guest User

Untitled

a guest
Jul 16th, 2019
104
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.57 KB | None | 0 0
  1. import platform
  2. import sys
  3. import numpy as np
  4. import json
  5. import re
  6. from pprint import pprint
  7. from datetime import timedelta, date
  8.  
  9. # Check platform for setting folder separator
  10. if platform.system() == "Windows":
  11. fs = "\\"
  12. else:
  13. fs = "/"
  14.  
  15. # Set the default encoding as 'utf-8'
  16. if sys.version_info < (3, 0):
  17. reload(sys)
  18. sys.setdefaultencoding('utf-8')
  19.  
  20. # Select dates of the json file
  21. years = [2014, 2015, 2016]
  22. skip_missing_dates = 1 # Skips the missing dates (0: No, 1: Yes)
  23. DW_npData = []
  24. destinations = []
  25.  
  26. # File locations for output
  27. loc_DW_npData = ".." + fs + "Data" + fs + "DelayData_WeatherData" + fs + "DW_npData_training"
  28. loc_destinations = ".." + fs + "Data" + fs + "Destinations_training"
  29.  
  30. # Save delay data from json files to numpy arrays
  31. def daterange(start_date, end_date):
  32. for n in range(int((end_date - start_date).days) + 1):
  33. yield start_date + timedelta(n)
  34.  
  35. # Convert destinations to numbers
  36. def convertDestination(destination):
  37. if destination not in destinations:
  38. destinations.append(destination)
  39.  
  40. return destinations.index(destination)
  41.  
  42. total_files = 0
  43. missing_files = 0
  44. wrong_files = 0
  45. skipped_rows = 0
  46.  
  47. for year in years:
  48. # Date range
  49. print("Converting " + str(year) + " to numpy arrays:")
  50. start = date(year, 1, 1)
  51. end = date(year, 12, 31)
  52.  
  53. for YMD in daterange(start,end):
  54. d = str(YMD.strftime("%Y%m%d"))
  55.  
  56. total_files += 1
  57.  
  58. # File location
  59. file = ".." + fs + "Webscraper" + fs + str(year) + fs + "departmentinfo_UT_" + str(year) + "_" + str(int(YMD.strftime("%m"))) + "_" + str(int(YMD.strftime("%d"))) + ".json"
  60.  
  61. if skip_missing_dates == 1:
  62. try:
  63. Data = json.load(open(file))
  64. except FileNotFoundError:
  65. missing_files += 1
  66. print(" " + file)
  67. print(" File does not exist, so skipping it...")
  68. continue
  69. else:
  70. Data = json.load(open(file))
  71.  
  72. try:
  73. len(Data["department_info"][d])
  74. except KeyError:
  75. wrong_files += 1
  76. print(" " + file)
  77. print(" File does exist, but does not contain useful information, so skipping it")
  78. continue
  79.  
  80. for row in range(1, len(Data["department_info"][d])):
  81. month = d[4:6]
  82. day = d[-2:]
  83. try:
  84. Data["department_info"][d][str(row)]["department_nextarrival"]["time"]
  85. except KeyError:
  86. skipped_rows += 1
  87. print(" " + file)
  88. print(" Skipped the row after the following time because it doesn't exists in the json file: " + str(hour) + ":" + str(minute))
  89. continue
  90.  
  91. hour, minute = str(Data["department_info"][d][str(row)]["department_nextarrival"]["time"]).split(".")
  92. depart_delay = str(Data["department_info"][d][str(row)]["department_nextarrival"]["current_depart_delay"])
  93. arrival_delay = str(Data["department_info"][d][str(row)]["department_nextarrival"]["next_arrival_delay"])
  94. if depart_delay == "None":
  95. depart_delay = 0
  96. else:
  97. depart_delay = re.findall('\d+', depart_delay)
  98. depart_delay = depart_delay[0]
  99. if arrival_delay == "None":
  100. arrival_delay = 0
  101. else:
  102. arrival_delay = re.findall('\d+', arrival_delay)
  103. arrival_delay = arrival_delay[0]
  104.  
  105. delay = int(arrival_delay) - int(depart_delay)
  106. delay = max(0, delay)
  107.  
  108. destination = str(Data["department_info"][d][str(row)]["department_nextarrival"]["final_destination"])
  109. destination = convertDestination(destination)
  110. track = str(Data["department_info"][d][str(row)]["department_nextarrival"]["track"])
  111. if track == "None":
  112. track = 0
  113. else:
  114. track = re.findall('\d+', track)
  115. track = track[0]
  116. wind_speed = str(Data["department_info"][d][str(row)]["department_nextarrival"]["wind_speed"])
  117. temperature = str(Data["department_info"][d][str(row)]["department_nextarrival"]["temperature"])
  118. snow = str(Data["department_info"][d][str(row)]["department_nextarrival"]["snow"])
  119.  
  120. DW_npData.append(np.array([int(month), int(day), int(hour), int(minute), int(delay), int(destination), int(track), int(wind_speed), int(temperature), int(snow)]))
  121.  
  122. # Print an example of delay data for a specific time
  123. print("\nFirst 10 rows in DW_npData:")
  124. print(" mon day hr min del des tr w tem snow")
  125. for i in range(0, 10):
  126. print(" " + str(DW_npData[i]))
  127. print("\nLast 10 rows in DW_npData:")
  128. print(" mon day hr min del des tr w tem snow")
  129. for i in range(0, 10):
  130. print(" " + str(DW_npData[len(DW_npData) - 1 - (10 - i)]))
  131. print("\nNumber of rows in DW_npData:")
  132. print(" " + str(len(DW_npData)))
  133.  
  134. print("\nSaving DW_npData to file:")
  135. print(" " + loc_DW_npData)
  136. np.save(loc_DW_npData, DW_npData)
  137.  
  138. print("\nSaving destinations to file:")
  139. print(" " + loc_destinations)
  140. np.save(loc_destinations, destinations)
  141.  
  142. print("\nFrom the total of " + str(total_files) + " files:")
  143. print(" " + str(missing_files) + " missing files")
  144. print(" " + str(wrong_files) + " wrong files")
  145. print(" " + str(skipped_rows) + " skipped rows\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement