# https://github.com/sabbadini/FlightDelay/blob/master/flights.py
# inspired by https://gist.github.com/martinwicke/6838c23abdc53e6bcda36ed9f40cff39

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

# We're using pandas to read the CSV file. This is easy for small datasets, but for large and complex datasets,
# tensorflow parsing and processing functions are more powerful.
import pandas as pd
import numpy as np

import tensorflow as tf
import glob
#import cloudstorage as gcs
import os
from numpy import nan

#from google.appengine.api import app_identity

# We removed the CSV header as part of the ETL process, so we'll define the column names here.
names = [
    'FL_DATE',
    'UNIQUE_CARRIER',
    'AIRLINE_ID',
    'CARRIER',
    'FL_NUM',
    'ORIGIN_AIRPORT_ID',
    'ORIGIN_AIRPORT_SEQ_ID',
    'ORIGIN_CITY_MARKET_ID',
    'ORIGIN',
    'DEST_AIRPORT_ID',
    'DEST_AIRPORT_SEQ_ID',
    'DEST_CITY_MARKET_ID',
    'DEST',
    'CRS_DEP_TIME',
    'DEP_TIME',
    'DEP_DELAY',
    'TAXI_OUT',
    'WHEELS_OFF',
    'WHEELS_ON',
    'TAXI_IN',
    'CRS_ARR_TIME',
    'ARR_TIME',
    'ARR_DELAY',
    'CANCELLED',
    'CANCELLATION_CODE',
    'DIVERTED',
    'DISTANCE'
]

# Here we'll specify the dtypes.
dtypes = {
    'FL_DATE': str,
    'UNIQUE_CARRIER': str,
    'AIRLINE_ID': np.float32,
    'CARRIER': str,
    'FL_NUM': np.float32,
    'ORIGIN_AIRPORT_ID': np.float32,
    'ORIGIN_AIRPORT_SEQ_ID': np.float32,
    'ORIGIN_CITY_MARKET_ID': np.float32,
    'ORIGIN': str,
    'DEST_AIRPORT_ID': np.float32,
    'DEST_AIRPORT_SEQ_ID': np.float32,
    'DEST_CITY_MARKET_ID': np.float32,
    'DEST': str,
    'CRS_DEP_TIME': np.float32,
    'DEP_TIME': np.float32,
    'DEP_DELAY': np.float32,
    'TAXI_OUT': np.float32,
    'WHEELS_OFF': np.float32,
    'WHEELS_ON': np.float32,
    'TAXI_IN': np.float32,
    'CRS_ARR_TIME': np.float32,
    'ARR_TIME': np.float32,
    'ARR_DELAY': np.float32,
    'CANCELLED': np.float32,
    'CANCELLATION_CODE': str,
    'DIVERTED': np.float32,
    'DISTANCE': np.float32,
}

path = '201701.csv' # use your path
# allFiles = glob.glob(path + "01/*.csv")
# print("printing allfiles")
# print(allFiles)
# frame = pd.DataFrame()
# list_ = []
# for file_ in allFiles:
#     print(file_)
#     df = pd.read_csv(file_,index_col=None, header=0)
#     list_.append(df)
# frame = pd.concat(list_)

# Read the file.

#df = pd.concat([pd.read_csv(f) for f in glob.glob(path +'*.csv')], names=names, ignore_index = True)
# The header row was stripped during ETL, so read with header=None and the column names defined above.
df = pd.read_csv(path, header=None, names=names, dtype=dtypes, na_values='?')

df.info()

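# The comment at the top notes that TensorFlow's own parsing is better suited to large datasets;
# here is a minimal, optional sketch of reading the same file with the tf.data API instead of pandas
# (assumes TF 1.x; the function name is hypothetical and nothing below depends on it):
def tf_data_input_fn(csv_path, batch_size=64):
    # One default per column: empty string for string columns, 0.0 for numeric columns.
    record_defaults = [[''] if dtypes[name] is str else [0.0] for name in names]
    def parse_line(line):
        fields = tf.decode_csv(line, record_defaults=record_defaults)
        features = dict(zip(names, fields))
        label = features.pop('FL_NUM')
        return features, label
    return tf.data.TextLineDataset(csv_path).map(parse_line).batch(batch_size)
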
# Split the data into a training set and an eval set.

# .copy() so the pop() calls below don't trigger pandas' SettingWithCopyWarning on slices of df.
training_data = df[:465].copy()
eval_data = df[465:].copy()
test_data = df[:10].copy()  # note: these rows also sit in training_data; only used as a small prediction sample

#training_data, training_label = df, df.pop(ARR_TIME)
training_label = training_data.pop('FL_NUM')
eval_label = eval_data.pop('FL_NUM')
test_label = test_data.pop('FL_NUM')

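# The split above is purely positional; a minimal optional sketch of a shuffled split instead
# (hypothetical shuffled/alt_train/alt_eval names, fixed seed for reproducibility, not used below):
shuffled = df.sample(frac=1.0, random_state=42)
alt_train, alt_eval = shuffled[:465], shuffled[465:]
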
# pandas_input_fn expects an integer batch_size; num_epochs=None makes the training input repeat indefinitely.
training_input_fn = tf.estimator.inputs.pandas_input_fn(x=training_data, y=training_label, batch_size=64, shuffle=True, num_epochs=None)

eval_input_fn = tf.estimator.inputs.pandas_input_fn(x=eval_data, y=eval_label, batch_size=64, shuffle=False)

test_input_fn = tf.estimator.inputs.pandas_input_fn(x=test_data, y=test_label, batch_size=10, shuffle=False)

# Feature columns
carrier = tf.feature_column.categorical_column_with_vocabulary_list(key='CARRIER', vocabulary_list=['WN', 'OO', 'NK', 'AA', 'DL', 'UA'])
distance = tf.feature_column.numeric_column(key='DISTANCE')
dep_delay = tf.feature_column.numeric_column(key='DEP_DELAY')

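# Feature columns can also bucketize or hash raw inputs; two optional examples that could be added
# to the feature lists below (hypothetical names, not used by the estimators in this script):
dep_delay_buckets = tf.feature_column.bucketized_column(dep_delay, boundaries=[-15.0, 0.0, 15.0, 30.0, 60.0, 120.0])
origin = tf.feature_column.categorical_column_with_hash_bucket(key='ORIGIN', hash_bucket_size=1000)
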
# Linear Regressor

linear_features = [carrier, distance, dep_delay]
regressor = tf.estimator.LinearRegressor(feature_columns=linear_features)
regressor.train(input_fn=training_input_fn, steps=10000)
regressor.evaluate(input_fn=eval_input_fn)

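# evaluate() returns a dict of metrics (for regressors this includes 'average_loss', 'loss' and
# 'global_step'); a small optional sketch of capturing and printing it (this runs evaluation again):
linear_metrics = regressor.evaluate(input_fn=eval_input_fn)
print('Linear regressor eval metrics:', linear_metrics)
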
# Deep Neural Network

dnn_features = [
    # numeric features (FL_NUM is used as the label above, so it is not included here)
    distance, dep_delay,
    # densify categorical features:
    tf.feature_column.indicator_column(carrier),
]

# Use the core tf.estimator.DNNRegressor; tf.contrib.learn.DNNRegressor is deprecated and has a different API.
dnnregressor = tf.estimator.DNNRegressor(feature_columns=dnn_features, hidden_units=[50, 30, 10])
dnnregressor.train(input_fn=training_input_fn, steps=10000)
dnnregressor.evaluate(input_fn=eval_input_fn)

# Predict

# Core estimators expose predict() (predict_scores belongs to the old tf.contrib.learn API), and the
# training input repeats forever (num_epochs=None), so use the finite test/eval input functions here.
predictions = list(dnnregressor.predict(input_fn=test_input_fn))
print(predictions)

predictionsLarge = list(dnnregressor.predict(input_fn=eval_input_fn))
print(predictionsLarge)

predictionsLinear = list(regressor.predict(input_fn=test_input_fn))
print(predictionsLinear)

predictionsLinearLarge = list(regressor.predict(input_fn=eval_input_fn))
print(predictionsLinearLarge)
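
# Each element yielded by predict() is a dict such as {'predictions': array([value], dtype=float32)};
# a minimal sketch of pulling the scalar values out of one of the lists above:
linear_values = [p['predictions'][0] for p in predictionsLinear]
print('Linear regressor predictions on the test rows:', linear_values)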