Advertisement
Guest User

woe.py

a guest
Mar 23rd, 2018
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.29 KB | None | 0 0
  1. import numpy as np
  2. import math
  3. from scipy import stats
  4. from sklearn.utils.multiclass import type_of_target
  5.  
  6. class WOE:
  7.     def __init__(self):
  8.         self._WOE_MIN = -20
  9.         self._WOE_MAX = 20
  10.  
  11.     def woe(self, X, y, event=1):
  12.         '''
  13.        Calculate woe of each feature category and information value
  14.        :param X: 2-D numpy array explanatory features which should be discreted already
  15.        :param y: 1-D numpy array target variable which should be binary
  16.        :param event: value of binary stands for the event to predict
  17.        :return: numpy array of woe dictionaries, each dictionary contains woe values for categories of each feature
  18.                 numpy array of information value of each feature
  19.        '''
  20.         self.check_target_binary(y)
  21.         X1 = self.feature_discretion(X)
  22.  
  23.         res_woe = []
  24.         res_iv = []
  25.         for i in range(0, X1.shape[-1]):
  26.             if i != 0 and i % 100 == 0:
  27.                 print(i)
  28.             x = X1[:, i]
  29.             woe_dict, iv1 = self.woe_single_x(x, y, event)
  30.             res_woe.append(woe_dict)
  31.             res_iv.append(iv1)
  32.         return np.array(res_woe), np.array(res_iv)
  33.  
  34.     def woe_single_x(self, x, y, event=1):
  35.         '''
  36.        calculate woe and information for a single feature
  37.        :param x: 1-D numpy starnds for single feature
  38.        :param y: 1-D numpy array target variable
  39.        :param event: value of binary stands for the event to predict
  40.        :return: dictionary contains woe values for categories of this feature
  41.                 information value of this feature
  42.        '''
  43.         self.check_target_binary(y)
  44.  
  45.         event_total, non_event_total = self.count_binary(y, event=event)
  46.         x_labels = np.unique(x)
  47.         woe_dict = {}
  48.         iv = 0
  49.         for x1 in x_labels:
  50.             y1 = y[np.where(x == x1)[0]]
  51.             event_count, non_event_count = self.count_binary(y1, event=event)
  52.             rate_event = 1.0 * event_count / event_total
  53.             rate_non_event = 1.0 * non_event_count / non_event_total
  54.             if rate_event == 0:
  55.                 woe1 = self._WOE_MIN
  56.             elif rate_non_event == 0:
  57.                 woe1 = self._WOE_MAX
  58.             else:
  59.                 woe1 = math.log(rate_event / rate_non_event)
  60.             woe_dict[x1] = woe1
  61.             iv += (rate_event - rate_non_event) * woe1
  62.         return woe_dict, iv
  63.  
  64.     def woe_replace(self, X, woe_arr):
  65.         '''
  66.        replace the explanatory feature categories with its woe value
  67.        :param X: 2-D numpy array explanatory features which should be discreted already
  68.        :param woe_arr: numpy array of woe dictionaries, each dictionary contains woe values for categories of each feature
  69.        :return: the new numpy array in which woe values filled
  70.        '''
  71.         if X.shape[-1] != woe_arr.shape[-1]:
  72.             raise ValueError('WOE dict array length must be equal with features length')
  73.  
  74.         res = np.copy(X).astype(float)
  75.         idx = 0
  76.         for woe_dict in woe_arr:
  77.             for k in woe_dict.keys():
  78.                 woe = woe_dict[k]
  79.                 res[:, idx][np.where(res[:, idx] == k)[0]] = woe * 1.0
  80.             idx += 1
  81.  
  82.         return res
  83.  
  84.     def combined_iv(self, X, y, masks, event=1):
  85.         '''
  86.        calcute the information vlaue of combination features
  87.        :param X: 2-D numpy array explanatory features which should be discreted already
  88.        :param y: 1-D numpy array target variable
  89.        :param masks: 1-D numpy array of masks stands for which features are included in combination,
  90.                      e.g. np.array([0,0,1,1,1,0,0,0,0,0,1]), the length should be same as features length
  91.        :param event: value of binary stands for the event to predict
  92.        :return: woe dictionary and information value of combined features
  93.        '''
  94.         if masks.shape[-1] != X.shape[-1]:
  95.             raise ValueError('Masks array length must be equal with features length')
  96.  
  97.         x = X[:, np.where(masks == 1)[0]]
  98.         tmp = []
  99.         for i in range(x.shape[0]):
  100.             tmp.append(self.combine(x[i, :]))
  101.  
  102.         dumy = np.array(tmp)
  103.         # dumy_labels = np.unique(dumy)
  104.         woe, iv = self.woe_single_x(dumy, y, event)
  105.         return woe, iv
  106.  
  107.     def combine(self, list):
  108.         res = ''
  109.         for item in list:
  110.             res += str(item)
  111.         return res
  112.  
  113.     def count_binary(self, a, event=1):
  114.         event_count = (a == event).sum()
  115.         non_event_count = a.shape[-1] - event_count
  116.         return event_count, non_event_count
  117.  
  118.     def check_target_binary(self, y):
  119.         '''
  120.        check if the target variable is binary, raise error if not.
  121.        :param y:
  122.        :return:
  123.        '''
  124.         y_type = type_of_target(y)
  125.         if y_type not in ['binary']:
  126.             raise ValueError('Label type must be binary')
  127.  
  128.     def feature_discretion(self, X):
  129.         '''
  130.        Discrete the continuous features of input data X, and keep other features unchanged.
  131.        :param X : numpy array
  132.        :return: the numpy array in which all continuous features are discreted
  133.        '''
  134.         temp = []
  135.         for i in range(0, X.shape[-1]):
  136.             x = X[:, i]
  137.             x_type = type_of_target(x)
  138.             if x_type == 'continuous':
  139.                 x1 = self.discrete(x)
  140.                 temp.append(x1)
  141.             else:
  142.                 temp.append(x)
  143.         return np.array(temp).T
  144.  
  145.     def discrete(self, x):
  146.         '''
  147.        Discrete the input 1-D numpy array using 5 equal percentiles
  148.        :param x: 1-D numpy array
  149.        :return: discreted 1-D numpy array
  150.        '''
  151.         res = np.array([0] * x.shape[-1], dtype=int)
  152.         for i in range(5):
  153.             point1 = stats.scoreatpercentile(x, i * 20)
  154.             point2 = stats.scoreatpercentile(x, (i + 1) * 20)
  155.             x1 = x[np.where((x >= point1) & (x <= point2))]
  156.             mask = np.in1d(x, x1)
  157.             res[mask] = (i + 1)
  158.         return res
  159.  
  160.     @property
  161.     def WOE_MIN(self):
  162.         return self._WOE_MIN
  163.     @WOE_MIN.setter
  164.     def WOE_MIN(self, woe_min):
  165.         self._WOE_MIN = woe_min
  166.     @property
  167.     def WOE_MAX(self):
  168.         return self._WOE_MAX
  169.     @WOE_MAX.setter
  170.     def WOE_MAX(self, woe_max):
  171.         self._WOE_MAX = woe_max
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement