Advertisement
Guest User

Untitled

a guest
May 26th, 2019
138
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.75 KB | None | 0 0
import datetime
import math
from itertools import repeat

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from pandas import DataFrame
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
  10.  
  11. '''hbos and isolation forest,
  12. evaluation , model performance comparison from kaggle'''
  13.  
  14. data = pd.read_csv("truck.csv")
  15. print(data)
  16. print(data.shape)
  17. # outliers = data.loc[data['All_Faults_in_3_months']== 0]
  18. # print(outliers)
  19. # data = data.drop(labels=['Send_Date','PARTITIONING','VEHICLE_ID','All_Fault_in_3_months'],axis=1)
  20. # data_columns = data.columns[1:401]
  21. # print(data_columns)
  22. # # print(data.columns[1:401])
  23. #
  24. # dataa = data.iloc[:,0:400]
  25. # print(dataa)
  26.  
  27.  
  28.  
  29.  
  30. class HBOS:
  31.  
  32. def __init__(self, log_scale=True, ranked=False, bin_info_array=[], mode_array=[], nominal_array=[]):
  33. self.log_scale = log_scale
  34. self.ranked = ranked
  35. self.bin_info_array = bin_info_array
  36. self.mode_array = mode_array
  37. self.nominal_array = nominal_array # self.histogram_list = []
  38.  
  39. def fit(self, data):
  40. attr_size = len(data.columns)
  41. total_data_size = len(data)
  42.  
  43. # init params if needed
  44. if len(self.bin_info_array) == 0:
  45. self.bin_info_array = list(repeat(-1, attr_size))
  46.  
  47. if len(self.mode_array) == 0:
  48. self.mode_array = list(repeat('dynamic binwidth', attr_size))
  49.  
  50. if len(self.nominal_array) == 0:
  51. self.nominal_array = list(repeat(False, attr_size))
  52.  
  53. if self.ranked:
  54. self.log_scale = False
  55.  
  56. normal = 1.0
  57.  
  58. # calculate standard _bin size if needed
  59. for i in range(len(self.bin_info_array)):
  60. if self.bin_info_array[i] == -1:
  61. self.bin_info_array[i] = round(math.sqrt(len(data)))
  62.  
  63. # initialize histogram
  64. self.histogram_list = []
  65. for i in range(attr_size):
  66. self.histogram_list.append([])
  67.  
  68. # save maximum value for every attribute(needed to normalize _bin width)
  69. maximum_value_of_rows = data.apply(max).values
  70.  
  71. # sort data
  72. sorted_data = data.apply(sorted)
  73.  
  74. # create histograms
  75. for attrIndex in range(len(sorted_data.columns)):
  76. attr = sorted_data.columns[attrIndex]
  77. last = 0
  78. bin_start = sorted_data[attr][0]
  79. if self.mode_array[attrIndex] == 'dynamic binwidth':
  80. if self.nominal_array[attrIndex] == True:
  81. while last < len(sorted_data) - 1:
  82. last = self.create_dynamic_histogram(self.histogram_list, sorted_data, last, 1, attrIndex, True)
  83. else:
  84. length = len(sorted_data)
  85. binwidth = self.bin_info_array[attrIndex]
  86. while last < len(sorted_data) - 1:
  87. values_per_bin = math.floor(len(sorted_data) / self.bin_info_array[attrIndex])
  88. last = self.create_dynamic_histogram(self.histogram_list, sorted_data, last, values_per_bin,
  89. attrIndex, False)
  90. if binwidth > 1:
  91. length = length - self.histogram_list[attrIndex][-1].quantity
  92. binwidth = binwidth - 1
  93. else:
  94. count_bins = 0
  95. binwidth = (sorted_data[attr][len(sorted_data) - 1] - sorted_data[attr][0]) / self.bin_info_array[
  96. attrIndex]
  97. if (self.nominal_array[attrIndex] == True) | (binwidth == 0):
  98. binwidth = 1
  99. while last < len(sorted_data):
  100. is_last_bin = count_bins == self.bin_info_array[attrIndex] - 1
  101. last = self.create_static_histogram(self.histogram_list, sorted_data, last, binwidth, attrIndex,
  102. bin_start, is_last_bin)
  103. bin_start = bin_start + binwidth
  104. count_bins = count_bins + 1
  105.  
  106. # calculate score using normalized _bin width
  107. # _bin width is normalized to the number of datapoints
  108. # save maximum score for every attr(needed to normalize score)
  109. max_score = []
  110.  
  111. # loop for all histograms
  112. for i in range(len(self.histogram_list)):
  113. max_score.append(0)
  114. histogram = self.histogram_list[i]
  115.  
  116. # loop for all bins
  117. for k in range(len(histogram)):
  118. _bin = histogram[k]
  119. _bin.total_data_size = total_data_size
  120. _bin.calc_score(maximum_value_of_rows[i])
  121. if max_score[i] < _bin.score:
  122. max_score[i] = _bin.score
  123.  
  124. for i in range(len(self.histogram_list)):
  125. histogram = self.histogram_list[i]
  126. for k in range(len(histogram)):
  127. _bin = histogram[k]
  128. _bin.normalize_score(normal, max_score[i], self.log_scale)
  129.  
  130. # if ranked
  131.  
  132. def predict(self, data):
  133. score_array = []
  134. for i in range(len(data)):
  135. each_data = data.values[i]
  136. value = 1
  137. if self.log_scale | self.ranked:
  138. value = 0
  139. for attr in range(len(data.columns)):
  140. score = self.get_score(self.histogram_list[attr], each_data[attr])
  141. if self.log_scale:
  142. value = value + score
  143. elif self.ranked:
  144. value = value + score
  145. else:
  146. value = value * score
  147. score_array.append(value)
  148. return score_array
  149.  
  150. def fit_predict(self, data):
  151. self.fit(data)
  152. return self.predict(data)
  153.  
  154. def get_score(self, histogram, value):
  155. for i in range(len(histogram) - 1):
  156. _bin = histogram[i]
  157. if (_bin.range_from <= value) & (value < _bin.range_to):
  158. return _bin.score
  159.  
  160. _bin = histogram[-1]
  161. if (_bin.range_from <= value) & (value <= _bin.range_to):
  162. return _bin.score
  163. return 0
  164.  
  165. @staticmethod
  166. def check_amount(sortedData, first_occurrence, values_per_bin, attr):
  167. # check if there are more than values_per_bin values of a given value
  168. if first_occurrence + values_per_bin < len(sortedData):
  169. if sortedData[attr][first_occurrence] == sortedData[attr][first_occurrence + values_per_bin]:
  170. return True
  171. else:
  172. return False
  173. else:
  174. return False
  175.  
  176. @staticmethod
  177. def create_dynamic_histogram(histogram_list, sortedData, first_index, values_per_bin, attrIndex, isNominal):
  178. last_index = 0
  179. attr = sortedData.columns[attrIndex]
  180.  
  181. # create new _bin
  182. _bin = HistogramBin(sortedData[attr][first_index], 0, 0)
  183.  
  184. # check if an end of the data is near
  185. if first_index + values_per_bin < len(sortedData):
  186. last_index = first_index + values_per_bin
  187. else:
  188. last_index = len(sortedData)
  189.  
  190. # the first value always goes to the _bin
  191. _bin.add_quantitiy(1)
  192.  
  193. # for every other value
  194. # check if it is the same as the last value
  195. # if so
  196. # put it into the _bin
  197. # if not
  198. # check if there are more than values_per_bin of that value
  199. # if so
  200. # open new _bin
  201. # if not
  202. # continue putting the value into the _bin
  203.  
  204. cursor = first_index
  205. for i in range(first_index + 1, last_index):
  206. if sortedData[attr][i] == sortedData[attr][cursor]:
  207. _bin.add_quantitiy(1)
  208. cursor = cursor + 1
  209. else:
  210. if HBOS.check_amount(sortedData, i, values_per_bin, attr):
  211. break
  212. else:
  213. _bin.add_quantitiy(1)
  214. cursor = cursor + 1
  215.  
  216. # continue to put values in the _bin until a new values arrive
  217. for i in range(cursor + 1, len(sortedData)):
  218. if sortedData[attr][i] == sortedData[attr][cursor]:
  219. _bin.quantity = _bin.quantity + 1
  220. cursor = cursor + 1
  221. else:
  222. break
  223.  
  224. # adjust range of the bins
  225. if cursor + 1 < len(sortedData):
  226. _bin.range_to = sortedData[attr][cursor + 1]
  227. else: # last data
  228. if isNominal:
  229. _bin.range_to = sortedData[attr][len(sortedData) - 1] + 1
  230. else:
  231. _bin.range_to = sortedData[attr][len(sortedData) - 1]
  232.  
  233. # save _bin
  234. if _bin.range_to - _bin.range_from > 0:
  235. histogram_list[attrIndex].append(_bin)
  236. elif len(histogram_list[attrIndex]) == 0:
  237. _bin.range_to = _bin.range_to + 1
  238. histogram_list[attrIndex].append(_bin)
  239. else:
  240. # if the _bin would have length of zero
  241. # we merge it with previous _bin
  242. # this can happen at the end of the histogram
  243. lastBin = histogram_list[attrIndex][-1]
  244. lastBin.add_quantitiy(_bin.quantity)
  245. lastBin.range_to = _bin.range_to
  246.  
  247. return cursor + 1
  248.  
  249. @staticmethod
  250. def create_static_histogram(histogram_list, sorted_data, first_index, binwidth, attrIndex, bin_start, last_bin):
  251. attr = sorted_data.columns[attrIndex]
  252. _bin = HistogramBin(bin_start, bin_start + binwidth, 0)
  253. if last_bin == True:
  254. _bin = HistogramBin(bin_start, sorted_data[attr][len(sorted_data) - 1], 0)
  255.  
  256. last = first_index - 1
  257. cursor = first_index
  258.  
  259. while True:
  260. if cursor >= len(sorted_data):
  261. break
  262. if sorted_data[attr][cursor] > _bin.range_to:
  263. break
  264. _bin.quantity = _bin.quantity + 1
  265. last = cursor
  266. cursor = cursor + 1
  267.  
  268. histogram_list[attrIndex].append(_bin)
  269. return last + 1
  270.  
  271.  
  272. class HistogramBin:
  273.  
  274. def __init__(self, range_from, range_to, quantity):
  275. self.range_from = range_from
  276. self.range_to = range_to
  277. self.quantity = quantity
  278. self.score = 0
  279. self.total_data_size = 0
  280.  
  281. def get_height(self):
  282. width = self.range_to - self.range_from
  283. height = self.quantity / width
  284. return height
  285.  
  286. def add_quantitiy(self, anz):
  287. self.quantity = self.quantity + anz
  288.  
  289. def calc_score(self, max_score):
  290. if max_score == 0:
  291. max_score = 1
  292.  
  293. if self.quantity > 0:
  294. self.score = self.quantity / ((self.range_to - self.range_from) * self.total_data_size / abs(max_score))
  295.  
  296. def normalize_score(self, normal, max_score, log_scale):
  297. self.score = self.score * normal / max_score
  298. if (self.score == 0):
  299. return
  300. self.score = 1 / self.score
  301. if log_scale:
  302. self.score = math.log10(self.score)
  303.  
  304.  
  305. del data['Send_Date']
  306. del data['PARTITIONING']
  307. del data['VEHICLE_ID']
  308. del data['All_Fault_in_3_months']
  309.  
  310.  
  311.  
  312. orig = data.copy()
  313. print(orig[:10])
  314. print(data[:10])
  315.  
  316. hbos = HBOS()
  317. hbos_result = hbos.fit_predict(data)
  318.  
  319. print(hbos_result[:10])
  320. hbos_orig = orig.copy()
  321. hbos_orig['hbos'] = hbos_result
  322. print(hbos_orig[:10])
  323.  
  324. hbos_top1000_data = hbos_orig.sort_values(by=['hbos'],ascending=False)[:1000]
  325. print(hbos_top1000_data[:10])
  326.  
  327. print(len(hbos_top1000_data[lambda x:x['All_Fault_in_3_months']==1]))
  328. from matplotlib import pyplot as plt
  329. print(hbos_top1000_data['All_Fault_in_3_months'].cumsum().sum())
  330. plt.scatter(range(1000),hbos_top1000_data['All_Fault_in_3_months'].cumsum(),marker='1')
  331. plt.xlabel('Normal data')
  332. plt.ylabel('Anomalies found in the data')
  333. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement