Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import datetime
import math
from itertools import repeat

import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
# HBOS and isolation forest:
# evaluation and model performance comparison, from Kaggle.

# Load the raw data set and show its contents and dimensions.
data = pd.read_csv("truck.csv")
print(data)
print(data.shape)

# Earlier exploration steps, kept for reference (not executed):
# outliers = data.loc[data['All_Faults_in_3_months']== 0]
# print(outliers)
# data = data.drop(labels=['Send_Date','PARTITIONING','VEHICLE_ID','All_Fault_in_3_months'],axis=1)
# data_columns = data.columns[1:401]
# print(data_columns)
# # print(data.columns[1:401])
#
# dataa = data.iloc[:,0:400]
# print(dataa)
class HBOS:
    """HBOS (histogram-based outlier score) detector for DataFrame columns.

    ``fit`` builds one histogram per column and gives every bin a score;
    ``predict`` combines, for each row, the scores of the bins its values
    fall into. With the default ``log_scale=True`` a bin's score is
    ``log10(max_density / density)``, so rows made of rare values receive
    larger totals.

    Parameters
    ----------
    log_scale : bool
        Use ``log10`` of the inverted, normalised bin density as the bin
        score and sum the per-column scores. Forced to ``False`` by ``fit``
        when ``ranked`` is true.
    ranked : bool
        When true the per-column scores are summed without the logarithm.
    bin_info_array : sequence of int, optional
        Number of bins per column; ``-1`` means "use round(sqrt(n_rows))".
        Empty/omitted means ``-1`` for every column.
    mode_array : sequence of str, optional
        Per-column binning mode. ``'dynamic binwidth'`` selects the
        equal-frequency histogram; any other value selects fixed-width bins.
        Empty/omitted means ``'dynamic binwidth'`` for every column.
    nominal_array : sequence of bool, optional
        Per-column flag marking nominal (categorical) columns.
        Empty/omitted means ``False`` for every column.

    Notes
    -----
    Defaults that ``fit`` resolves (bin counts, modes, nominal flags) are
    stored back on the instance and therefore persist across later ``fit``
    calls.
    """

    def __init__(self, log_scale=True, ranked=False, bin_info_array=None,
                 mode_array=None, nominal_array=None):
        self.log_scale = log_scale
        self.ranked = ranked
        # Copy the configuration so that fit(), which fills in defaults in
        # place, never writes into a list owned by the caller or shared
        # between instances.
        self.bin_info_array = [] if bin_info_array is None else list(bin_info_array)
        self.mode_array = [] if mode_array is None else list(mode_array)
        self.nominal_array = [] if nominal_array is None else list(nominal_array)

    def fit(self, data):
        """Build and score one histogram per column of ``data``.

        The result is stored in ``self.histogram_list`` (one list of
        ``HistogramBin`` objects per column, in column order).

        Raises
        ------
        ValueError
            If ``data`` has no rows.
        """
        attr_size = len(data.columns)
        total_data_size = len(data)
        if total_data_size == 0:
            raise ValueError("HBOS.fit requires at least one row of data")

        # init params if needed
        if len(self.bin_info_array) == 0:
            self.bin_info_array = [-1] * attr_size
        if len(self.mode_array) == 0:
            self.mode_array = ['dynamic binwidth'] * attr_size
        if len(self.nominal_array) == 0:
            self.nominal_array = [False] * attr_size

        if self.ranked:
            self.log_scale = False
        normal = 1.0

        # replace the -1 placeholder with the standard bin count
        default_bin_count = round(math.sqrt(total_data_size))
        for i, bin_count in enumerate(self.bin_info_array):
            if bin_count == -1:
                self.bin_info_array[i] = default_bin_count

        # one (initially empty) histogram per attribute
        self.histogram_list = [[] for _ in range(attr_size)]

        # maximum value of every attribute (needed to normalize bin width)
        maximum_value_of_rows = data.apply(max).values

        # every column sorted independently of the others
        sorted_data = data.apply(sorted)
        row_count = len(sorted_data)

        # create histograms
        for attrIndex, attr in enumerate(sorted_data.columns):
            # Positional array access: independent of the frame's index
            # labels and much cheaper than frame[attr][i].
            column = sorted_data[attr].values
            bin_count = self.bin_info_array[attrIndex]
            is_nominal = bool(self.nominal_array[attrIndex])
            last = 0

            if self.mode_array[attrIndex] == 'dynamic binwidth':
                # nominal columns get one distinct value per bin
                if is_nominal:
                    values_per_bin = 1
                else:
                    values_per_bin = math.floor(row_count / bin_count)
                # NOTE(review): the loop stops at row_count - 1, so a final
                # value that would start a bin of its own is not counted in
                # any bin's quantity (it is still scored by the last bin,
                # whose upper bound is inclusive). Kept as in the original.
                while last < row_count - 1:
                    last = self.create_dynamic_histogram(
                        self.histogram_list, sorted_data, last,
                        values_per_bin, attrIndex, is_nominal)
            else:
                bin_start = column[0]
                binwidth = (column[row_count - 1] - column[0]) / bin_count
                if is_nominal or binwidth == 0:
                    binwidth = 1
                count_bins = 0
                while last < row_count:
                    is_last_bin = count_bins == bin_count - 1
                    last = self.create_static_histogram(
                        self.histogram_list, sorted_data, last, binwidth,
                        attrIndex, bin_start, is_last_bin)
                    bin_start = bin_start + binwidth
                    count_bins = count_bins + 1

        # Score every bin using the bin width normalized by the attribute
        # maximum and the number of data points, then rescale by the
        # largest score found within the same attribute.
        for histogram, attr_maximum in zip(self.histogram_list, maximum_value_of_rows):
            max_score = 0
            for _bin in histogram:
                _bin.total_data_size = total_data_size
                _bin.calc_score(attr_maximum)
                if max_score < _bin.score:
                    max_score = _bin.score
            for _bin in histogram:
                _bin.normalize_score(normal, max_score, self.log_scale)

    def predict(self, data):
        """Return a list with one outlier score per row of ``data``.

        Per-column bin scores are summed when ``log_scale`` or ``ranked`` is
        set and multiplied otherwise. ``fit`` must have been called first,
        with the same column order.
        """
        additive = bool(self.log_scale or self.ranked)
        attr_count = len(data.columns)
        rows = data.values  # materialise once instead of once per row

        score_array = []
        for each_data in rows:
            value = 0 if additive else 1
            for attr in range(attr_count):
                score = self.get_score(self.histogram_list[attr], each_data[attr])
                if additive:
                    value = value + score
                else:
                    value = value * score
            score_array.append(value)
        return score_array

    def fit_predict(self, data):
        """Fit on ``data`` and return the scores of the same rows."""
        self.fit(data)
        return self.predict(data)

    def get_score(self, histogram, value):
        """Return the score of the bin of ``histogram`` that holds ``value``.

        Bins are half-open ``[range_from, range_to)`` except the last one,
        whose upper bound is inclusive. Returns 0 when no bin matches or the
        histogram is empty.
        """
        if not histogram:
            return 0
        for _bin in histogram[:-1]:
            if _bin.range_from <= value < _bin.range_to:
                return _bin.score
        _bin = histogram[-1]
        if _bin.range_from <= value <= _bin.range_to:
            return _bin.score
        return 0

    @staticmethod
    def check_amount(sortedData, first_occurrence, values_per_bin, attr):
        """Tell whether a value repeats for more than ``values_per_bin`` rows.

        Compares the value at position ``first_occurrence`` of column
        ``attr`` with the one ``values_per_bin`` positions later; because
        the column is sorted, equality means the whole span holds one value.
        Returns ``False`` when the later position is past the end.
        """
        probe = first_occurrence + values_per_bin
        if probe >= len(sortedData):
            return False
        values = sortedData[attr].values
        return bool(values[first_occurrence] == values[probe])

    @staticmethod
    def create_dynamic_histogram(histogram_list, sortedData, first_index, values_per_bin, attrIndex, isNominal):
        """Create one equal-frequency bin starting at ``first_index``.

        The bin is appended to ``histogram_list[attrIndex]``, or merged into
        the previous bin when it would have zero width. Returns the position
        of the first value that was not consumed.
        """
        attr = sortedData.columns[attrIndex]
        values = sortedData[attr].values
        size = len(sortedData)

        # create new bin; its upper bound is fixed further down
        _bin = HistogramBin(values[first_index], 0, 0)

        # do not run past the end of the data
        last_index = min(first_index + values_per_bin, size)

        # the first value always goes to the bin
        _bin.add_quantitiy(1)

        # Every further value joins the bin, unless it is a new value that
        # itself occurs more than values_per_bin times - such a value gets
        # a bin of its own, so this bin is closed before it.
        cursor = first_index
        for i in range(first_index + 1, last_index):
            if values[i] != values[cursor] and HBOS.check_amount(sortedData, i, values_per_bin, attr):
                break
            _bin.add_quantitiy(1)
            cursor = cursor + 1

        # keep absorbing repeats of the last value so that equal values are
        # never split across two bins
        while cursor + 1 < size and values[cursor + 1] == values[cursor]:
            _bin.add_quantitiy(1)
            cursor = cursor + 1

        # the bin ends where the next value starts
        if cursor + 1 < size:
            _bin.range_to = values[cursor + 1]
        elif isNominal:
            # last data: give the final nominal value a bin of width 1
            _bin.range_to = values[size - 1] + 1
        else:
            _bin.range_to = values[size - 1]

        # save bin
        bins = histogram_list[attrIndex]
        if _bin.range_to - _bin.range_from > 0:
            bins.append(_bin)
        elif len(bins) == 0:
            # a zero-width first bin is widened so it can still be scored
            _bin.range_to = _bin.range_to + 1
            bins.append(_bin)
        else:
            # a zero-width bin (can happen at the end of the histogram)
            # is merged into the previous bin
            lastBin = bins[-1]
            lastBin.add_quantitiy(_bin.quantity)
            lastBin.range_to = _bin.range_to
        return cursor + 1

    @staticmethod
    def create_static_histogram(histogram_list, sorted_data, first_index, binwidth, attrIndex, bin_start, last_bin):
        """Create one fixed-width bin ``[bin_start, bin_start + binwidth]``.

        When ``last_bin`` is true the bin instead extends to the largest
        value of the column. Values from ``first_index`` on are counted
        while they do not exceed the upper bound. The bin is appended to
        ``histogram_list[attrIndex]`` and the position of the first value
        that was not consumed is returned.
        """
        attr = sorted_data.columns[attrIndex]
        values = sorted_data[attr].values
        size = len(sorted_data)

        if last_bin:
            range_to = values[size - 1]
        else:
            range_to = bin_start + binwidth
        _bin = HistogramBin(bin_start, range_to, 0)

        cursor = first_index
        while cursor < size and not (values[cursor] > _bin.range_to):
            _bin.quantity = _bin.quantity + 1
            cursor = cursor + 1

        histogram_list[attrIndex].append(_bin)
        return cursor
class HistogramBin:
    """One bin of a histogram: a value range, a count and a score.

    Attributes are plain and are written directly by the code that builds
    the histogram:

    * ``range_from`` / ``range_to`` - lower and upper bound of the bin
    * ``quantity`` - number of values counted into the bin
    * ``score`` - bin score, filled in by ``calc_score`` and then
      ``normalize_score``
    * ``total_data_size`` - size of the data set, used by ``calc_score``;
      must be set by the caller before scoring
    """

    def __init__(self, range_from, range_to, quantity):
        self.range_from = range_from
        self.range_to = range_to
        self.quantity = quantity
        self.score = 0
        self.total_data_size = 0

    def get_height(self):
        """Return the bin's density: quantity divided by bin width."""
        width = self.range_to - self.range_from
        return self.quantity / width

    def add_quantitiy(self, anz):
        """Increase the bin's count by ``anz``."""
        self.quantity = self.quantity + anz

    # Correctly spelled alias; the misspelled name is kept because existing
    # callers use it.
    add_quantity = add_quantitiy

    def calc_score(self, max_score):
        """Set ``score`` to the bin density with a normalised width.

        ``max_score`` is the value the bin width is divided by (0 is treated
        as 1, and the absolute value is used); the width is additionally
        scaled by ``total_data_size``. Empty bins keep their current score.
        """
        if max_score == 0:
            max_score = 1
        if self.quantity > 0:
            normalized_width = (self.range_to - self.range_from) * self.total_data_size / abs(max_score)
            self.score = self.quantity / normalized_width

    def normalize_score(self, normal, max_score, log_scale):
        """Rescale ``score`` by ``normal / max_score`` and invert it.

        After the call a dense bin has a small score and a sparse bin a
        large one; with ``log_scale`` the inverted score is passed through
        ``log10``. A score of 0 is left at 0. When ``max_score`` is 0 there
        is nothing to normalise against and the score is left unchanged.
        """
        if max_score == 0:
            # avoids dividing by zero
            return
        self.score = self.score * normal / max_score
        if self.score == 0:
            return
        self.score = 1 / self.score
        if log_scale:
            self.score = math.log10(self.score)
from matplotlib import pyplot as plt

# Drop the identifier columns that must not be used as features.
del data['Send_Date']
del data['PARTITIONING']
del data['VEHICLE_ID']

# Snapshot the frame while it still contains the label column: the
# evaluation below looks up 'All_Fault_in_3_months' on copies of `orig`,
# so the copy has to be taken before the label is removed from the features.
orig = data.copy()
del data['All_Fault_in_3_months']

print(orig[:10])
print(data[:10])

# Score every row with HBOS, using the feature columns only.
hbos = HBOS()
hbos_result = hbos.fit_predict(data)
print(hbos_result[:10])

hbos_orig = orig.copy()
hbos_orig['hbos'] = hbos_result
print(hbos_orig[:10])

# Keep the (up to) 1000 rows with the highest outlier score.
hbos_top1000_data = hbos_orig.sort_values(by=['hbos'], ascending=False)[:1000]
print(hbos_top1000_data[:10])

# Number of labelled faults among the top-scored rows.
top_labels = hbos_top1000_data['All_Fault_in_3_months']
print(int((top_labels == 1).sum()))

# Cumulative number of faults found while walking down the ranking.
cumulative_faults = top_labels.cumsum()
print(cumulative_faults.sum())

# Use the actual row count so the plot also works with fewer than 1000 rows.
plt.scatter(range(len(hbos_top1000_data)), cumulative_faults, marker='1')
plt.xlabel('Normal data')
plt.ylabel('Anomalies found in the data')
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement