celestialgod

TT

Jun 16th, 2019
172
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.09 KB | None | 0 0
  1. from math import log
  2. import bisect
  3. from collections import Counter
  4.  
  5. class DiscretizationMDLP(object):
  6.     def __init__(self, x, y):
  7.         # sort first
  8.         self.data = sorted(zip(*[x, y]), key=lambda x: x[0])
  9.         # store keys for binary search
  10.         self.keys = [z[0] for z in self.data]
  11.        
  12.     def _get_entropy(self, y):
  13.         """function to get entropy and unique count of y"""
  14.         # get counter of y
  15.         counter_y = Counter(y)
  16.         # get proportions for each y
  17.         ps = [value/len(y) for _, value in counter_y.items()]
  18.         if len(ps) == 1:
  19.             return 0, len(counter_y.keys())
  20.         else:
  21.             return -sum([p*log(p) for p in ps]), len(counter_y.keys())
  22.  
  23.     def get_cut(self, low, upp):
  24.         """function to get cut point"""
  25.         # get supports to check
  26.         support = sorted(list(set(self.keys[low:upp])))
  27.         # initialize parameters
  28.         n = upp - low
  29.         prev_ent, prev_cut = 9999, None
  30.         entropy1, entropy2, k1, k2, w = None, None, None, None, None
  31.         # get whole entropy
  32.         whole_ent, k = self._get_entropy([y for _, y in self.data[low:upp]])
  33.         for i in range(len(support)-1):
  34.             cut_value = (support[i] + support[i+1])/2
  35.             # get current cut point
  36.             curr_cut = bisect.bisect_right(self.keys[low:upp], cut_value)
  37.             # calculate weight
  38.             weight = curr_cut/n
  39.             # get the entropies of two partitions
  40.             left_ent, left_y_cnt = self._get_entropy([y for _, y in self.data[low:(low+curr_cut)]])
  41.             right_ent, right_y_cnt = self._get_entropy([y for _, y in self.data[(low+curr_cut):upp]])
  42.             # get current entropy
  43.             curr_ent = weight * left_ent + (1-weight) * right_ent
  44.             # compare the entropy to the history
  45.             if curr_ent < prev_ent:
  46.                 prev_ent, prev_cut = curr_ent, curr_cut
  47.                 # keep these variables for calculating the stop criterio
  48.                 entropy1, entropy2, k1, k2, w = left_ent, right_ent, left_y_cnt, right_y_cnt, weight
  49.         if prev_cut is None:
  50.             return None, None, None
  51.         else:
  52.             # get entropy gain
  53.             gain = whole_ent - (w * entropy1 + (1-w) * entropy2)
  54.             # get stoping value
  55.             delta = log(pow(3, k) - 2) - (k * whole_ent - k1 * entropy1 - k2 * entropy2)
  56.             # check whether to stop or not
  57.             stop_cut = gain <= 1 / n * (log(n - 1) + delta)
  58.             return prev_ent, low+prev_cut, stop_cut
  59.  
  60.     def get_partition_points(self):
  61.         cut_points = []
  62.         search_intervals = [(0, len(self.data))]
  63.         while len(search_intervals) > 0:
  64.             low, upp = search_intervals.pop()
  65.             entropy, cut, stop_cut = self.get_cut(low, upp)
  66.             if cut is None or stop_cut:
  67.                 continue
  68.             search_intervals.append((low, cut))
  69.             search_intervals.append((cut, upp))
  70.             cut_points.append(cut)
  71.         return sorted([(self.keys[i-1] + self.keys[i])/2 for i in cut_points])
  72.         # return sorted(cut_points)
Advertisement
Add Comment
Please, Sign In to add comment