# Untitled

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import math
from pandas import DataFrame
import datetime
from itertools import repeat
from sklearn.ensemble import IsolationForest
from matplotlib import pyplot as pltå
import seaborn as sns

'''HBOS and Isolation Forest:
evaluation and model performance comparison (from Kaggle).'''

# Isolation Forest anomaly ranking on truck.csv.
#
# Loads the data, drops the columns excluded from modelling, scores every row
# with an IsolationForest, and reports how many rows labelled Class == 1 show
# up among the TOP_N most anomalous rows.

# The file-level import binds pyplot to the misspelled alias `pltå`, so `plt`
# would be undefined below; bind the intended name here.
from matplotlib import pyplot as plt

# Columns removed before modelling.
DROPPED_COLUMNS = ['Send_Date', 'PARTITIONING', 'VEHICLE_ID',
                   'All_Fault_in_3_months']
# Ground-truth column; used only for evaluation, never as a model feature.
LABEL_COLUMN = 'Class'
# Number of most-anomalous rows to inspect.
TOP_N = 1000
# Fixed seed so the ranking is reproducible between runs.
RANDOM_STATE = 42

data = pd.read_csv("truck.csv")
print(data)
print(data.shape)
data = data.drop(columns=DROPPED_COLUMNS)

orig = data.copy()
print(orig[:10])
print(data[:10])

# Fit on the features only: leaving the label in the training matrix would
# leak the answer into the scores that are evaluated against that same label.
features = data.drop(columns=[LABEL_COLUMN])

iforest = IsolationForest(random_state=RANDOM_STATE)
iforest.fit(features)
# decision_function: the lower the score, the more abnormal the row.
iforest_result = iforest.decision_function(features)

print(iforest_result[:10])

iforest_orig = orig.copy()
iforest_orig['if'] = iforest_result

print(iforest_orig[:10])

# Ascending sort puts the most anomalous (lowest-scoring) rows first.
iforest_top1000_data = iforest_orig.sort_values(by=['if'], ascending=True)[:TOP_N]

print(iforest_top1000_data[:15])

# Number of labelled anomalies captured within the top rows.
print(len(iforest_top1000_data[iforest_top1000_data[LABEL_COLUMN] == 1]))

# Running count of labelled anomalies as more top-ranked rows are included.
anomalies_found = iforest_top1000_data[LABEL_COLUMN].cumsum()
print(anomalies_found.sum())

# Use the actual number of selected rows for the x axis: the file may hold
# fewer than TOP_N rows, and scatter requires x and y of equal length.
plt.scatter(range(len(anomalies_found)), anomalies_found, marker='1')
plt.xlabel('Top N data')
plt.ylabel('Anomalies found')
plt.show()