Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Third-party imports and global configuration for the truck-fault
# anomaly-detection script.
import itertools

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt, style
from sklearn.cluster import DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, fbeta_score, precision_score,
                             recall_score)
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.utils import class_weight

# Consistent plot styling and reproducible randomness for the whole run.
style.use('ggplot')
np.random.seed(42)
# Load the raw truck telemetry, order it chronologically, and split it into a
# 400-column feature frame (`df_data`) and a label series (`output`).
raw = pd.read_csv("truck.csv")
data = raw.copy()
print(data.columns)
print(data)

# Sort by send date so that the row-wise differencing done later compares
# consecutive points in time.
data = data.sort_values(by=['Send_Date'])
print(data)

# Column index 402 is the fault label (named 'All_Fault_in_3_months' where it
# is consumed downstream).
output = data.iloc[:, 402]
print(output)
print(data.shape)

# Drop bookkeeping columns that carry no sensor signal.
data = data.drop(["PARTITIONING", "Send_Date", "VEHICLE_ID"], axis=1)
print(data.shape)
print(output.shape)
print(data.columns)

# Keep only the first 400 columns as model features.
df_data = data.iloc[:, :400]
print(df_data.columns)
print(df_data)
print(df_data.shape)
print(output.shape)
print(type(df_data))
print(type(output))
print(df_data.shape)
# Work on row-to-row changes instead of absolute sensor values: take first
# differences of consecutive telemetry rows (defaults: periods=1, axis=0).
diff = df_data.diff()
print(diff.shape)

# The first differenced row is all-NaN (it has no predecessor); drop it, and
# drop the matching first label so features and labels stay aligned.
diff = diff.iloc[1:, :]
print(diff.shape)
output = output.iloc[1:]
print(output.shape)
# Dimensionality reduction: inspect the PCA cumulative-variance curve, project
# the differenced features onto 26 components, standardize them, and join them
# with the labels into a single modelling frame `data`.
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import scale

# NOTE(review): this scaled copy is never used afterwards (the PCA below is
# fit on the raw `diff`); kept only to preserve the original binding.
df_data = pd.DataFrame(scale(diff))

# Fit a wide PCA first just to see how much cumulative variance the leading
# components explain.
covar_matrix = PCA(n_components=150)
covar_matrix.fit(diff)
var = np.cumsum(np.round(covar_matrix.explained_variance_ratio_, decimals=3) * 100)
print(var)

plt.ylabel('% Variance Explained')
plt.xlabel('# of features')
plt.title('PCA features')
plt.ylim(40, 100.5)
plt.plot(var)
# (The original called plt.style.context('seaborn-whitegrid') here without a
# `with` block — a no-op, so it has been removed.)

# Project onto 26 components for the actual model input.
pca_116 = PCA(n_components=26)
pca_result_116 = pca_116.fit_transform(diff)
print(pca_result_116.shape)
print(output.shape)

from sklearn import preprocessing
dataset1_standardized = preprocessing.scale(pca_result_116)
print(dataset1_standardized.shape)
dataset1_standardized = pd.DataFrame(dataset1_standardized)

# BUG FIX: `dataset1_standardized` has a fresh RangeIndex, while `output`
# still carried the shuffled index from the date-sorted CSV. `pd.concat` on
# axis=1 aligns on index, so the original mis-aligned features with labels and
# padded the frame with NaN rows. Reset the label index so rows line up 1:1.
output = pd.DataFrame(output).reset_index(drop=True)
print(output.shape)
print(type(dataset1_standardized))
print(type(output))
data = pd.concat([dataset1_standardized, output], axis=1)
print(output.shape)
print(data.shape)

# Truncate to a fixed row count — presumably tied to this specific dataset's
# size; TODO(review): confirm 67037 is still valid if truck.csv changes.
data = data.iloc[:67037, :]
print(data.shape)
# Re-import the modelling/evaluation stack and clean the assembled dataset.
# (Several of these repeat earlier imports; all are kept so this section can
# also be run standalone, e.g. as its own notebook cell.)
import itertools
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt, style
from sklearn.cluster import DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (accuracy_score, average_precision_score,
                             classification_report, f1_score, fbeta_score,
                             precision_recall_curve, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import KFold
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, scale
from sklearn.svm import OneClassSVM

import mlxtend
from mlxtend.evaluate import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
# Imported after the mlxtend version on purpose: the sklearn confusion_matrix
# is the binding every later call resolves to (as in the original ordering).
from sklearn.metrics import confusion_matrix

style.use('ggplot')
np.random.seed(42)
warnings.filterwarnings("ignore")

# Drop the NaN rows introduced by the feature/label concatenation upstream.
data = data.dropna()
print(data['All_Fault_in_3_months'].value_counts())
print(data['All_Fault_in_3_months'].isna().sum())
# 5-fold evaluation of an IsolationForest anomaly detector on the PCA
# features, with a per-fold threshold sweep over the decision scores.
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(data):
    data_train, data_test = data.iloc[train_index], data.iloc[test_index]
    print('Train shape: ', data_train.shape)
    # (typo fix: the original messages read "Proportion os anomaly")
    print('Proportion of anomaly in training set: %.2f\n' % data_train['All_Fault_in_3_months'].mean())
    print('Test shape:, ', data_test.shape)
    print('Proportion of anomaly in test set: %.2f\n' % data_test['All_Fault_in_3_months'].mean())

    np.random.seed(42)
    # contamination=0.017 presumably matches the observed fault rate —
    # TODO(review): confirm against the label distribution printed earlier.
    model = IsolationForest(random_state=32, n_jobs=4,
                            max_samples=data_train.shape[0], bootstrap=True,
                            contamination=0.017, n_estimators=50, max_features=1)
    model.fit(data_train.drop('All_Fault_in_3_months', axis=1).values)

    # Mean decision score of normal vs. faulty test rows (lower = more anomalous).
    print(model.decision_function(data_test[data_test['All_Fault_in_3_months'] == 0].drop('All_Fault_in_3_months', axis=1).values).mean())
    print(model.decision_function(data_test[data_test['All_Fault_in_3_months'] == 1].drop('All_Fault_in_3_months', axis=1).values).mean())

    # Sweep candidate thresholds: a row is flagged anomalous (1) when its
    # decision score falls below the threshold. Record [precision, F2] each.
    thresholds = np.linspace(0.02, .05, 200)
    y_scores = model.decision_function(data_test.drop('All_Fault_in_3_months', axis=1).values)
    scores = []
    for threshold in thresholds:
        y_hat = (y_scores < threshold).astype(int)
        scores.append([precision_score(y_pred=y_hat, y_true=data_test['All_Fault_in_3_months']),
                       fbeta_score(y_pred=y_hat, y_true=data_test['All_Fault_in_3_months'], beta=2)])
    scores = np.array(scores)

    # Pick the threshold with the highest precision (column 0 of `scores`).
    final_tresh = thresholds[scores[:, 0].argmax()]
    # BUG FIX: the original hard-coded `< 0.02` here, ignoring the threshold it
    # had just selected (and prints as "Final threshold" below). Use the
    # selected threshold, reusing the scores already computed above.
    y_hat_test = (y_scores < final_tresh).astype(int)
    print('Final threshold: %f' % final_tresh)
    average_precision = average_precision_score(data_test['All_Fault_in_3_months'].values, y_hat_test)
    print("\n")
    precision, recall, _ = precision_recall_curve(data_test['All_Fault_in_3_months'].values, y_hat_test)
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    print('Test Recall Score: %.3f' % recall_score(y_pred=y_hat_test, y_true=data_test['All_Fault_in_3_months'].values))
    print('Test Precision Score: %.3f' % precision_score(y_pred=y_hat_test, y_true=data_test['All_Fault_in_3_months'].values))
    print()
    print('Test F2 Score: %.3f' % fbeta_score(y_pred=y_hat_test, y_true=data_test['All_Fault_in_3_months'].values, beta=2))
    print(confusion_matrix(data_test['All_Fault_in_3_months'].values, y_hat_test))
    print(classification_report(data_test['All_Fault_in_3_months'].values, y_hat_test))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement