Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Isolation-forest anomaly scoring for the truck dataset.

Original author's note: HBOS and isolation forest, evaluation, model
performance comparison from Kaggle.

The script loads ``truck.csv``, drops a few columns, fits an
``IsolationForest``, attaches the decision-function score to every row as
column ``if``, and reports how many rows with ``Class == 1`` appear among
the 1000 lowest-scoring rows.
"""
import datetime
import math
from itertools import repeat

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from pandas import DataFrame
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

data = pd.read_csv("truck.csv")
print(data)
print(data.shape)

# Columns removed before modelling.
for column in ('Send_Date', 'PARTITIONING', 'VEHICLE_ID',
               'All_Fault_in_3_months'):
    del data[column]

orig = data.copy()
print(orig[:10])
print(data[:10])

# 'Class' is the ground truth used for the evaluation below, so it must not
# be part of the model input; otherwise the label leaks into the scores.
features = data.drop(columns=['Class'])

iforest = IsolationForest()
iforest.fit(features)
# Lower decision-function values mean "more abnormal".
iforest_result = iforest.decision_function(features)
print(iforest_result[:10])

iforest_orig = orig.copy()
iforest_orig['if'] = iforest_result
print(iforest_orig[:10])

# Ascending sort puts the most abnormal rows first; keep at most 1000.
iforest_top1000_data = iforest_orig.sort_values(by=['if'], ascending=True)[:1000]
print(iforest_top1000_data[:15])
print(int((iforest_top1000_data['Class'] == 1).sum()))

anomalies_found = iforest_top1000_data['Class'].cumsum()
print(anomalies_found.sum())

# Size the x-axis from the data: the slice above may hold fewer than 1000
# rows, and scatter() requires x and y to have the same length.
plt.scatter(range(len(anomalies_found)), anomalies_found, marker='1')
plt.xlabel('Top N data')
plt.ylabel('Anomalies found')
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement