beegie_b

Naive Bayes

Sep 3rd, 2013
5,277
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.87 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. __author__ = 'Miroslaw Horbal'
  4. __email__ = '[email protected]'
  5. __date__ = '12-06-2013'
  6.  
  7. import numpy as np
  8. import pandas as pd
  9.  
  10. from data_utils import *
  11.  
  12. def create_test_submission(filename, prediction):
  13.     content = ['id,ACTION']
  14.     for i, p in enumerate(prediction):
  15.         content.append('%i,%f' %(i+1,p))
  16.     f = open(filename, 'w')
  17.     f.write('\n'.join(content))
  18.     f.close()
  19.     print 'Saved'
  20.  
  21. def extract_counts(L):
  22.     """
  23.    Take a 1D numpy array as input and return a dict mapping values to counts
  24.    """
  25.     uniques = set(list(L))
  26.     counts = dict((u, np.sum(L==u)) for u in uniques)
  27.     return counts
  28.    
  29. class NaiveBayesClassifier(object):
  30.     """
  31.    Naive Bayes Classifier with additive smoothing
  32.    
  33.    Params
  34.        :alpha - hyperparameter for additive smoothing
  35.    """
  36.     def __init__(self, alpha=1.0):
  37.         self.alpha = alpha
  38.    
  39.     def __repr__(self):
  40.         return 'NaiveBayesClassifier(alpha=%.1e)' % (self.alpha)
  41.    
  42.     def fit(self, X, y):
  43.         """
  44.        Trains Naive Bayes Classifier on data X with labels y
  45.        
  46.        Input
  47.            :X - numpy.array with shape (num_points, num_features)
  48.            :y - numpy.array with shape (num_points, )
  49.        
  50.        Sets attributes
  51.            :pos_prior - estimate of prior probability of label 1
  52.            :neg_prior - estimate of prior probability of label 0
  53.        """
  54.         self._pos_counts = [extract_counts(L) for L in X[y==1].T]
  55.         self._neg_counts = [extract_counts(L) for L in X[y==0].T]
  56.         self._total_pos = float(sum(y==1))
  57.         self._total_neg = float(sum(y==0))
  58.         total = self._total_pos + self._total_neg
  59.         self.pos_prior = self._total_pos / total
  60.         self.neg_prior = self._total_neg / total
  61.  
  62.     def log_transform(self, X):
  63.         """
  64.        Returns log ((P(c=1) / P(c=0)) * prod_i P(x_i | c=1) / P(x_i | c=0))
  65.        using additive smoothing
  66.        
  67.        Input
  68.            :X - numpy.array with shape (num_points, num_features)
  69.                 num_features must be the same as data used to fit model
  70.        """
  71.         m,n = X.shape
  72.         alpha = self.alpha
  73.         tot_neg = self._total_neg
  74.         tot_pos = self._total_pos
  75.         preds = np.zeros((m,n))
  76.         for i, xi in enumerate(X):
  77.             Pxi_neg = np.zeros(n)
  78.             Pxi_pos = np.zeros(n)
  79.             for j, v in enumerate(xi):
  80.                 nc = self._neg_counts[j].get(v,0)
  81.                 pc = self._pos_counts[j].get(v,0)
  82.                 nneg = len(self._neg_counts[j])
  83.                 npos = len(self._pos_counts[j])
  84.                 # Compute probabilities with additive smoothing
  85.                 Pxi_neg[j] = (nc + alpha) / (tot_neg + alpha * nneg)
  86.                 Pxi_pos[j] = (pc + alpha) / (tot_pos + alpha * npos)
  87.             # Compute log pos / neg class ratio
  88.             preds[i,:] = np.log(Pxi_pos) - np.log(Pxi_neg)
  89.         return preds
  90.          
  91.     def log_predict(self, X):
  92.         """
  93.        Returns log ((P(c=1) / P(c=0)) * prod_i P(x_i | c=1) / P(x_i | c=0))
  94.        using additive smoothing
  95.        
  96.        Input
  97.            :X - numpy.array with shape (num_points, num_features)
  98.                 num_features must be the same as data used to fit model
  99.        """
  100.         m,n = X.shape
  101.         if n != len(self._pos_counts):
  102.             raise Error('Dimension mismatch: expected %i features, got %i' % (
  103.                          len(self._pos_counts), n))
  104.         alpha = self.alpha
  105.         tot_neg = self._total_neg
  106.         tot_pos = self._total_pos
  107.         preds = np.zeros(m)
  108.         for i, xi in enumerate(X):
  109.             Pxi_neg = np.zeros(n)
  110.             Pxi_pos = np.zeros(n)
  111.             for j, v in enumerate(xi):
  112.                 nc = self._neg_counts[j].get(v,0)
  113.                 pc = self._pos_counts[j].get(v,0)
  114.                 nneg = len(self._neg_counts[j])
  115.                 npos = len(self._pos_counts[j])
  116.                 # Compute probabilities with additive smoothing
  117.                 Pxi_neg[j] = (nc + alpha) / (tot_neg + alpha * nneg)
  118.                 Pxi_pos[j] = (pc + alpha) / (tot_pos + alpha * npos)
  119.             # Compute log pos / neg class ratio
  120.             preds[i] = np.log(self.pos_prior) + np.sum(np.log(Pxi_pos)) - \
  121.                        np.log(self.neg_prior) - np.sum(np.log(Pxi_neg))
  122.         return preds
  123.  
  124.     def predict(self, X, cutoff=0):
  125.         """
  126.        Returns predicted binary classes for data with decision boundry given
  127.        by cutoff
  128.        
  129.        Input
  130.            :X - see NaiveBayesClassifier.log_predict
  131.            :cutoff - decision boundry for log predictions
  132.        """
  133.         preds = self.log_predict(X)
  134.         return (preds >= cutoff).astype(int)
  135.    
  136. def main(train_file='train.csv', test_file='test.csv', output_file='nb_predict.csv'):
  137.     # Load data
  138.     print 'Loading data...'
  139.     train_data = pd.read_csv(train_file)
  140.     test_data = pd.read_csv(test_file)
  141.     y = np.array(train_data.ACTION)
  142.     X = np.array(train_data.ix[:,1:-1])     # Ignores ACTION, ROLE_CODE
  143.     X_test = np.array(test_data.ix[:,1:-1]) # Ignores ID, ROLE_CODE
  144.    
  145.     # Convert features to triples
  146.     print 'Transforming data...'
  147.     X = group_data(X, degree=2)
  148.     X_test = group_data(X_test, degree=2)
  149.     model = NaiveBayesClassifier(alpha=1e-10)
  150.    
  151.     # Train model
  152.     print 'Training Naive Bayes Classifier...'
  153.     #~ model.fit(X, y)
  154.    
  155.     # Make prediction on test set
  156.     print 'Predicting on test set...'
  157.     #~ preds = model.log_predict(X_test)
  158.    
  159.     print 'Writing predictions to %s...' % (output_file)
  160.     #~ create_test_submission(output_file, preds)
  161.  
  162.     return model, X, y, X_test
  163.    
  164. if __name__=='__main__':
  165.     args = { 'train_file':  'train.csv',
  166.              'test_file':   'test.csv',
  167.              'output_file': 'nb_predict.csv' }
  168.     model, X, y, X_test = main(**args)
Advertisement
Add Comment
Please, Sign In to add comment