import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from geopy.distance import vincenty
from pyproj import Proj
from math import radians, cos, sin, asin, sqrt
from sklearn.model_selection import GridSearchCV


def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great-circle distance between two points
    on the earth (specified in decimal degrees).
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # radius of earth in kilometers; use 3956 for miles
    return c * r * 1000  # return in meters
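
# A minimal usage sketch (hypothetical coordinates): the great-circle distance
# from London (51.5074 N, -0.1278 E) to Paris (48.8566 N, 2.3522 E) should come
# out at roughly 3.44e5 meters:
#   haversine(-0.1278, 51.5074, 2.3522, 48.8566)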
#---------------------------------------------------------------------------------------------------------------
def regression_allset(Y_test_lon, Y_test_lat, X_test, ml_lon, ml_lat):  # only for tests

    # turn into lists
    predicts_lon = ml_lon.predict(X_test)
    predicts_lat = ml_lat.predict(X_test)

    error = []

    # build the projection once: UTM zone 23, southern hemisphere, WGS84
    myProj = Proj("+proj=utm +zone=23 +south +ellps=WGS84 +datum=WGS84 +units=m +no_defs")

    for j in range(len(X_test)):

        # convert the coordinates from UTM meters back to decimal degrees
        lon_pred, lat_pred = myProj(predicts_lon[j], predicts_lat[j], inverse=True)
        lon_Y, lat_Y = myProj(Y_test_lon[j], Y_test_lat[j], inverse=True)

        # join each point in a unique list; geopy expects (latitude, longitude) pairs
        Y = [lat_Y, lon_Y]
        predict = [lat_pred, lon_pred]

        # the distance between the two points is the error
        distance = vincenty(Y, predict).meters

        # if you want to use the haversine distance, uncomment the line below
        #distance = haversine(lon_Y, lat_Y, lon_pred, lat_pred)

        error.append(distance)

    return np.mean(error)
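
# A usage sketch (hypothetical names): given two fitted regressors, one per
# coordinate, this returns the mean positioning error in meters over the set:
#   err = regression_allset(utm_lon_true, utm_lat_true, X_test, mlp_lon, mlp_lat)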
#--------------------------------------------------------------------------------------------------------------
# calculate how many measurements each cell phone has
def show_number_measurements(grouped_df):
    for i in range(len(grouped_df)):
        print("Measures: " + str(len(grouped_df[i][1])) + ", PHONEID: " + str(grouped_df[i][1].PHONEID.unique()))
    print("\n")
#---------------------------------------------------------------------------------------------------------------

# create a list of data frames; each smartphone has its own data frame
def create_phone_df(df, grouped_df):
    list_phones = df.PHONEID.unique()
    df_phone = []

    j = 0
    for i in range(0, 24):
        if i in list_phones:
            df_phone.append(grouped_df[j][1])
            j = j + 1
        else:
            df_phone.append([])

    return df_phone, list_phones
#---------------------------------------------------------------------------------------------------------------

def undersampling(df_phone, phones_used):

    minimum = float('inf')
    und_df_phone = []

    # find the smallest data frame
    for i in phones_used:
        if len(df_phone[i]) < minimum:
            minimum = len(df_phone[i])
            ind_min = i

    # undersample the other data frames so they all have the same size
    for i in phones_used:
        if i != ind_min:
            und_df_phone.append(df_phone[i].sample(n=minimum))
        else:
            und_df_phone.append(df_phone[i])

    return und_df_phone
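
# Sketch of the effect (hypothetical sizes): if phones 6, 7, 13 and 14 have
# 500, 320, 1000 and 700 measurements, every returned frame ends up with 320
# rows; the smallest one is kept as-is and the others are randomly sampled.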

#---------------------------------------------------------------------------------------------------------------
def shuffle(und_df_phone):

    for i in range(len(und_df_phone)):
        und_df_phone[i] = und_df_phone[i].sample(frac=1)

    return und_df_phone

#---------------------------------------------------------------------------------------------------------------
def init_list_of_objects(size):
    list_of_objects = list()
    for i in range(0, size):
        list_of_objects.append(list())  # different object reference each time
    return list_of_objects
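
# The loop matters; a sketch of the pitfall it avoids:
#   wrong = [[]] * 3      # three references to the SAME list
#   wrong[0].append(1)    # -> [[1], [1], [1]]
#   right = init_list_of_objects(3)
#   right[0].append(1)    # -> [[1], [], []]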
#---------------------------------------------------------------------------------------------------------------
# return the number of hits
def compare(Y_test_build, predictions_build, Y_test_floor, predictions_floor):

    hits = 0
    # if the test and the prediction have the same building and the same floor, it counts as a hit
    for i in range(len(Y_test_floor)):
        if Y_test_build[i] == predictions_build[i] and Y_test_floor[i] == predictions_floor[i]:
            hits = hits + 1

    return hits
#---------------------------------------------------------------------------------------------------------------
# reorder the list: place each old prediction at its original position
def put_list(pred_old, index, pred_new):

    for i in range(len(index)):
        pred_new[index[i]] = pred_old[i]

    return pred_new
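
# A tiny sketch: put_list([10, 20], [2, 0], [None, None, None]) -> [20, None, 10]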

#---------------------------------------------------------------------------------------------------------------
def floor_classifier(predictions, train, test, method):

    successful_amount = 0
    pred_floor_ordered = init_list_of_objects(len(predictions))

    if method == 1:
        machine_learn = KNeighborsClassifier(n_neighbors=5, weights='distance')
    elif method == 2:
        #machine_learn = MLPClassifier(solver='sgd', learning_rate='adaptive', verbose=True, activation='tanh', alpha=1e-5)
        machine_learn = MLPClassifier(solver='sgd', learning_rate='adaptive', verbose=False, activation='tanh', alpha=1e-5, max_iter=400)  # THE BEST
        #machine_learn = MLPClassifier(hidden_layer_sizes=(100,5), solver='sgd', learning_rate='adaptive', verbose=True, activation='tanh', alpha=1e-5, max_iter=500)
        #model = MLPClassifier(learning_rate='adaptive')
        #solvers = ['lbfgs', 'sgd', 'adam']
        #activations = ['identity', 'logistic', 'tanh', 'relu']
        #max_its = [200, 400, 600]
        #machine_learn = GridSearchCV(estimator=model, param_grid=dict(activation=activations, max_iter=max_its), n_jobs=7)  # GRID

    # for each building
    for i in range(3):

        new_train = train.loc[train['BUILDINGID'] == i]  # select for training only samples with that building label (0, 1 or 2)
        indexes = [x for x in range(len(predictions)) if predictions[x] == i]  # positions of the samples predicted as building == i

        if indexes:  # if the list is not empty
            # training: samples with building == i
            X_train = new_train.iloc[:, 0:519]
            Y_train = new_train['FLOOR']
            machine_learn.fit(X_train, Y_train)

            # testing: samples with predicted building == i
            new_test = test.iloc[indexes, :]
            X_test = new_test.iloc[:, 0:519]

            Y_test_floor = new_test['FLOOR']
            Y_test_build = new_test['BUILDINGID']
            #if method == 2:
            #    print("best score:")
            #    print(machine_learn.best_score_)
            predictions_floor = machine_learn.predict(X_test)
            pred_floor_ordered = put_list(predictions_floor, indexes, pred_floor_ordered)

            # accumulate the number of hits ("predictions" must be a numpy array so it can be indexed with a list)
            successful_amount = compare(Y_test_build.tolist(), predictions[indexes].tolist(), Y_test_floor.tolist(), predictions_floor.tolist()) + successful_amount

    return successful_amount / float(len(test)), pred_floor_ordered
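
# A usage sketch (hypothetical inputs): with building predictions as a numpy
# array aligned with the rows of "test", this trains one floor classifier per
# building and returns the joint building+floor hit rate and the floor
# predictions restored to the original row order:
#   hit_rate, pred_floor = floor_classifier(np.array(pred_build), train, test, 1)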

#---------------------------------------------------------------------------------------------------------------
def coord_regression(predictions_b, predictions, train, test, method):

    mean_error = []

    if method == 1:
        machine_learn = KNeighborsRegressor(n_neighbors=5, weights='distance')
    elif method == 2:
        # the targets (LONGITUDE, LATITUDE) are continuous, so a regressor is required here
        machine_learn = MLPRegressor(solver='sgd', learning_rate='adaptive', verbose=False, activation='tanh', alpha=1e-5, max_iter=400)  # THE BEST
        #model = MLPRegressor(learning_rate='adaptive')
        #solvers = ['lbfgs', 'sgd', 'adam']
        #activations = ['identity', 'logistic', 'tanh', 'relu']
        #max_its = [200, 400, 600]
        #machine_learn = GridSearchCV(estimator=model, param_grid=dict(activation=activations, max_iter=max_its), n_jobs=7)  # GRID

    # build the projection once: UTM zone 23, southern hemisphere, WGS84
    myProj = Proj("+proj=utm +zone=23 +south +ellps=WGS84 +datum=WGS84 +units=m +no_defs")

    # for each building
    for j in range(3):
        new_train1 = train.loc[train['BUILDINGID'] == j]  # select for training only samples with that building label (0, 1 or 2)
        ind = [x for x in range(len(predictions_b)) if predictions_b[x] == j]  # positions of the samples predicted as building == j

        if ind:
            # for each floor
            for i in range(5):

                new_train2 = new_train1.loc[new_train1['FLOOR'] == i]
                if not new_train2.empty:
                    indexes = [x for x in range(len(predictions)) if (predictions[x] == i and predictions_b[x] == j)]  # samples predicted as floor == i and building == j
                else:
                    indexes = []

                if indexes:  # if the list is not empty

                    X_train = new_train2.iloc[:, 0:519]
                    Y_train = new_train2[['LONGITUDE', 'LATITUDE']]
                    machine_learn.fit(X_train, Y_train)

                    # testing: samples with predicted building == j and floor == i
                    new_test2 = test.iloc[indexes, :]
                    X_test = new_test2.iloc[:, 0:519]
                    Y_test = new_test2[['LONGITUDE', 'LATITUDE']]

                    # turn into lists
                    predicts_lon_lat = machine_learn.predict(X_test).tolist()
                    Y_test = Y_test.values.tolist()

                    distance = []
                    # "m" must not shadow the building loop variable "j"
                    for m in range(len(predicts_lon_lat)):

                        # convert the coordinates from UTM meters back to decimal degrees
                        lon_pred, lat_pred = myProj(predicts_lon_lat[m][0], predicts_lon_lat[m][1], inverse=True)
                        lon_Y, lat_Y = myProj(Y_test[m][0], Y_test[m][1], inverse=True)

                        # join each point in a unique list; geopy expects (latitude, longitude) pairs
                        Y = [lat_Y, lon_Y]
                        predict = [lat_pred, lon_pred]

                        # the distance between the two points is the error
                        distance.append(vincenty(Y, predict).meters)
                        print("distance")
                        print(distance)
                        # if you want to use the haversine distance, uncomment the line below
                        #print(haversine(lon_Y, lat_Y, lon_pred, lat_pred))

                    mean_error.append(np.mean(distance))
                    #print(np.mean(distance))

    return np.mean(mean_error)
#---------------------------------------------------------------------------------------------------------------

def regression_subset(predictions, train, test, method):

    mean_error = []

    if method == 1:
        machine_learn = KNeighborsRegressor(n_neighbors=5, weights='distance')
    elif method == 2:
        machine_learn = MLPRegressor(random_state=0)

    # build the projection once: UTM zone 23, southern hemisphere, WGS84
    myProj = Proj("+proj=utm +zone=23 +south +ellps=WGS84 +datum=WGS84 +units=m +no_defs")

    # for each building
    for i in range(3):

        new_train = train.loc[train['BUILDINGID'] == i]  # select for training only samples with that building label (0, 1 or 2)
        indexes = [x for x in range(len(predictions)) if predictions[x] == i]  # positions of the samples predicted as building == i

        if indexes:  # if the list is not empty
            # training: samples with building == i
            X_train = new_train.iloc[:, 0:519]
            Y_train = new_train[['LONGITUDE', 'LATITUDE']]
            machine_learn.fit(X_train, Y_train)

            # testing: samples with predicted building == i
            new_test = test.iloc[indexes, :]
            X_test = new_test.iloc[:, 0:519]
            Y_test = new_test[['LONGITUDE', 'LATITUDE']]

            # turn into lists
            predicts_lon_lat = machine_learn.predict(X_test).tolist()
            Y_test = Y_test.values.tolist()

            distance = []
            for j in range(len(predicts_lon_lat)):

                # convert the coordinates from UTM meters back to decimal degrees
                lon_pred, lat_pred = myProj(predicts_lon_lat[j][0], predicts_lon_lat[j][1], inverse=True)
                lon_Y, lat_Y = myProj(Y_test[j][0], Y_test[j][1], inverse=True)

                # join each point in a unique list; geopy expects (latitude, longitude) pairs
                Y = [lat_Y, lon_Y]
                predict = [lat_pred, lon_pred]

                # the distance between the two points is the error
                distance.append(vincenty(Y, predict).meters)

                # if you want to use the haversine distance, uncomment the line below
                #print(haversine(lon_Y, lat_Y, lon_pred, lat_pred))

            mean_error.append(np.mean(distance))
            #print(np.mean(distance))

    return np.mean(mean_error)
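
# Unlike coord_regression, this variant fits one regressor per building only,
# without further splitting the training data by floor.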

#---------------------------------------------------------------------------------------------------------------

def save_vec(hit_rate_build_mlp, hit_rate_floor_mlp, hit_rate_build_knn, hit_rate_floor_knn):

    np.save("build_mlp.npy", hit_rate_build_mlp)
    np.save("floor_mlp.npy", hit_rate_floor_mlp)

    np.save("build_knn.npy", hit_rate_build_knn)
    np.save("floor_knn.npy", hit_rate_floor_knn)
#---------------------------------------------------------------------------------------------------------------
def load_vec():
    hit_rate_build_mlp = np.load("build_mlp.npy")
    hit_rate_floor_mlp = np.load("floor_mlp.npy")

    hit_rate_build_knn = np.load("build_knn.npy")
    hit_rate_floor_knn = np.load("floor_knn.npy")

    # return the loaded arrays so the caller can use them
    return hit_rate_build_mlp, hit_rate_floor_mlp, hit_rate_build_knn, hit_rate_floor_knn

#---------------------------------------------------------------------------------------------------------------

def KFold(k, und_df_phone):

    #und_df_phone = shuffle(und_df_phone)
    phone = []

    # split the data frame of each smartphone into k parts
    for j in range(len(und_df_phone)):
        phone.append(np.array_split(und_df_phone[j], k))  # the first dimension of "phone" is the phone, the second is that smartphone's k splits

    # the grid-searched regressors only need to be built once, outside the loop
    model = MLPRegressor(learning_rate='adaptive')
    solvers = ['lbfgs', 'sgd', 'adam']
    activations = ['identity', 'logistic', 'tanh']
    max_its = [200, 400, 600]
    mlp_lon = GridSearchCV(estimator=model, param_grid=dict(solver=solvers, activation=activations, max_iter=max_its), n_jobs=4)  # GRID
    mlp_lat = GridSearchCV(estimator=model, param_grid=dict(solver=solvers, activation=activations, max_iter=max_its), n_jobs=4)  # GRID

    # create an empty list of lists with size len(und_df_phone)
    mean_error_mlp = init_list_of_objects(len(und_df_phone))

    for i in range(k):
        # separate each smartphone's data frame into test and train
        test = []  # list of data frames
        train = pd.DataFrame()
        for j in range(len(und_df_phone)):
            test.append(phone[j][i])
            # join the train set
            for x in range(k):
                if x != i:
                    train = pd.concat([train, phone[j][x]])

        # training with the total training set
        X_train = train.iloc[:, 0:519].values
        Y_train_lon = train['LONGITUDE'].values.tolist()
        Y_train_lat = train['LATITUDE'].values.tolist()

        mlp_lon.fit(X_train, Y_train_lon)
        mlp_lat.fit(X_train, Y_train_lat)

        # test all phones
        for j in range(len(und_df_phone)):
            # pick from the test set only the phone that will be evaluated
            data_test = test[j].iloc[:, 0:519].values.tolist()
            Y_test_lon = test[j]['LONGITUDE'].values.tolist()
            Y_test_lat = test[j]['LATITUDE'].values.tolist()

            mean_error_mlp[j].append(regression_allset(Y_test_lon, Y_test_lat, data_test, mlp_lon, mlp_lat))

    np.save("mean_error_mlp.npy", mean_error_mlp)
    print("mean error regression MLP")
    for j in range(len(und_df_phone)):
        print(str(np.mean(mean_error_mlp[j])) + " - " + str(np.std(mean_error_mlp[j])))
    print(" ")

    # best hyper-parameters found by the grid searches
    print("Best Params")
    print(mlp_lon.best_params_)
    print(mlp_lat.best_params_)
#---------------------------------------------------------------------------------------------------------------
def main():

    # defines
    phones_used = [6, 7, 13, 14]
    k = 10

    # convert the csv file into a data frame
    df = pd.read_csv('trainingData.csv')
    print(df.isnull().any())

    # group by PHONEID
    grouped_df = list(df.groupby(['PHONEID']))

    #show_number_measurements(grouped_df)

    # create a data frame for each phone
    df_phone, list_phones = create_phone_df(df, grouped_df)

    # do the undersampling
    und_df_phone = undersampling(df_phone, phones_used)

    KFold(k, und_df_phone)


if __name__ == "__main__":
    main()