import warnings

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt, style
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (fbeta_score, precision_score, recall_score,
                             confusion_matrix, classification_report,
                             average_precision_score, precision_recall_curve)
from sklearn.model_selection import KFold
from sklearn.preprocessing import scale

warnings.filterwarnings("ignore")
style.use('ggplot')
np.random.seed(42)

# Load the raw truck sensor readings and work on a copy.
df = pd.read_csv("truck.csv")
data = df.copy()
print(data.columns)
print(data.shape)

# Sort chronologically so that row-to-row differences are meaningful.
data = data.sort_values(by=['Send_Date'])

# Column 402 is the fault label ('All_Fault_in_3_months').
output = data.iloc[:, 402]
print(output.shape)

# Drop bookkeeping columns that carry no sensor signal.
data = data.drop(["PARTITIONING", "Send_Date", "VEHICLE_ID"], axis=1)
print(data.shape)

# Use the first 400 columns as the feature block.
df_data = data.iloc[:, :400]
print(df_data.shape)
# First-difference each feature along time so the detector sees step-to-step
# changes instead of absolute sensor levels.
diff = df_data.diff(axis=0, periods=1)
print(diff.shape)

# diff() leaves NaNs in the first row; drop it and drop the matching label
# so features and labels stay aligned.
diff = diff.iloc[1:, :]
output = output.iloc[1:]
print(diff.shape)
print(output.shape)
# Fit a 150-component PCA on the raw diffs and plot the cumulative explained
# variance to see how many components are worth keeping.
covar_matrix = PCA(n_components=150)
covar_matrix.fit(diff)
var = np.cumsum(np.round(covar_matrix.explained_variance_ratio_, decimals=3) * 100)
print(var)

with plt.style.context('seaborn-whitegrid'):
    plt.ylabel('% Variance Explained')
    plt.xlabel('# of features')
    plt.title('PCA features')
    plt.ylim(40, 100.5)
    plt.plot(var)
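# A minimal sketch, not in the original script: derive the component count
# from the cumulative-variance curve instead of hard-coding it. The 95%
# cutoff is an assumption for illustration, and this only works if the curve
# actually reaches the cutoff within the 150 fitted components.
n_keep = int(np.argmax(var >= 95)) + 1
print('Components needed for 95%% variance: %d' % n_keep)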
# Project the diffs onto 26 principal components.
pca_26 = PCA(n_components=26)
pca_result = pca_26.fit_transform(diff)
print(pca_result.shape)

# Standardize the projected features and reattach the label column.
dataset1_standardized = pd.DataFrame(scale(pca_result))
output = pd.DataFrame(output)

# Reset the label index before concatenating: `output` still carries the
# shuffled index left over from sort_values, and pd.concat(axis=1) aligns on
# index labels, which would otherwise scramble the feature/label pairing.
output = output.reset_index(drop=True)
data = pd.concat([dataset1_standardized, output], axis=1)
print(data.shape)

# Truncate to the first 67037 rows.
data = data.iloc[:67037, :]
print(data.shape)
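# A small sanity check, added here as an assumption: after the index reset
# and concat, the label column should contain no NaNs from misalignment.
assert data['All_Fault_in_3_months'].notna().all()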
# Drop any rows with missing values, then check the label balance: faults
# are a small minority of rows, which is why an unsupervised anomaly
# detector is used here rather than a plain classifier.
data = data.dropna()
print(data['All_Fault_in_3_months'].value_counts())
print(data['All_Fault_in_3_months'].isna().sum())
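# The rows are time-ordered (sorted by Send_Date above), so plain KFold lets
# training folds include rows from after the test period. As a sketch of an
# alternative, an assumption rather than part of the original run:
# scikit-learn's TimeSeriesSplit always trains on earlier rows and tests on
# later ones, and its splits could replace kf.split(data) below.
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)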
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(data):
    data_train, data_test = data.iloc[train_index], data.iloc[test_index]

    print('Train shape: ', data_train.shape)
    print('Proportion of anomaly in training set: %.2f\n' % data_train['All_Fault_in_3_months'].mean())
    print('Test shape: ', data_test.shape)
    print('Proportion of anomaly in test set: %.2f\n' % data_test['All_Fault_in_3_months'].mean())

    # Fit an Isolation Forest on the training features only; contamination
    # 0.017 roughly matches the observed fault rate.
    np.random.seed(42)
    model = IsolationForest(random_state=32, n_jobs=4,
                            max_samples=data_train.shape[0], bootstrap=True,
                            contamination=0.017, n_estimators=50, max_features=1)
    model.fit(data_train.drop('All_Fault_in_3_months', axis=1).values)

    # decision_function scores are lower for more anomalous rows, so normal
    # test rows should average higher than fault rows.
    print(model.decision_function(
        data_test[data_test['All_Fault_in_3_months'] == 0]
        .drop('All_Fault_in_3_months', axis=1).values).mean())
    print(model.decision_function(
        data_test[data_test['All_Fault_in_3_months'] == 1]
        .drop('All_Fault_in_3_months', axis=1).values).mean())

    # Sweep a grid of score thresholds, flagging rows below each threshold
    # as anomalies, and record precision and F2 at every step.
    thresholds = np.linspace(0.02, .05, 200)
    y_scores = model.decision_function(data_test.drop('All_Fault_in_3_months', axis=1).values)
    scores = []
    for threshold in thresholds:
        y_hat = (y_scores < threshold).astype(int)
        scores.append([precision_score(y_pred=y_hat, y_true=data_test['All_Fault_in_3_months']),
                       fbeta_score(y_pred=y_hat, y_true=data_test['All_Fault_in_3_months'], beta=2)])
    scores = np.array(scores)

    # Evaluate at the threshold that maximized precision.
    final_thresh = thresholds[scores[:, 0].argmax()]
    y_hat_test = (y_scores < final_thresh).astype(int)

    print('Final threshold: %f' % final_thresh)
    average_precision = average_precision_score(data_test['All_Fault_in_3_months'].values, y_hat_test)
    precision, recall, _ = precision_recall_curve(data_test['All_Fault_in_3_months'].values, y_hat_test)
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    print('Test Recall Score: %.3f' % recall_score(y_pred=y_hat_test, y_true=data_test['All_Fault_in_3_months'].values))
    print('Test Precision Score: %.3f' % precision_score(y_pred=y_hat_test, y_true=data_test['All_Fault_in_3_months'].values))
    print('Test F2 Score: %.3f' % fbeta_score(y_pred=y_hat_test, y_true=data_test['All_Fault_in_3_months'].values, beta=2))
    print(confusion_matrix(data_test['All_Fault_in_3_months'].values, y_hat_test))
    print(classification_report(data_test['All_Fault_in_3_months'].values, y_hat_test))
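# The paste also imports LocalOutlierFactor, OneClassSVM, and
# EllipticEnvelope without calling them. A minimal sketch, an assumption
# rather than part of the original run, of slotting LocalOutlierFactor into
# the same scoring scheme: with novelty=True it exposes decision_function
# like IsolationForest, so the threshold sweep above could be reused. This
# reuses data_train/data_test left over from the last fold of the loop above.
from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.017)
lof.fit(data_train.drop('All_Fault_in_3_months', axis=1).values)
lof_scores = lof.decision_function(data_test.drop('All_Fault_in_3_months', axis=1).values)
print('LOF mean score, normal rows: %.4f' %
      lof_scores[data_test['All_Fault_in_3_months'].values == 0].mean())
print('LOF mean score, fault rows:  %.4f' %
      lof_scores[data_test['All_Fault_in_3_months'].values == 1].mean())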