# Naive Bayes

#!/usr/bin/env python

__author__ = 'Miroslaw Horbal'
__email__ = '[email protected]'
__date__ = '12-06-2013'

import numpy as np
import pandas as pd

from data_utils import *

def create_test_submission(filename, prediction):
    """
    Write predictions to a CSV submission file

    The file starts with the header line 'id,ACTION' followed by one
    'id,value' line per prediction. Ids are 1-based and follow the order of
    prediction; values are formatted with '%f'. No trailing newline is
    written.

    Input
        :filename - path of the file to create (overwritten if it exists)
        :prediction - iterable of numeric predictions
    """
    content = ['id,ACTION']
    content.extend('%i,%f' % (row_id, p)
                   for row_id, p in enumerate(prediction, 1))
    # The context manager closes the handle even if the write fails
    with open(filename, 'w') as f:
        f.write('\n'.join(content))
    # Parenthesised single-argument print behaves the same on Python 2 and 3
    print('Saved')

def extract_counts(L):
    """
    Take a 1D numpy array as input and return a dict mapping values to counts

    Input
        :L - 1D numpy.array of hashable values

    Returns a plain dict {value: number of occurrences}; an empty input
    gives an empty dict. Counts are plain Python ints.
    """
    # Function-scope import so the file's import block does not need to change
    from collections import Counter
    # Counter tallies every value in a single pass over L, instead of one
    # full scan of L per distinct value.
    return dict(Counter(L))

class NaiveBayesClassifier(object):
    """
    Binary Naive Bayes Classifier with additive smoothing

    Every feature column is treated as categorical: fit stores, per class
    and per feature, how often each value occurs, and the prediction methods
    turn those counts into smoothed likelihoods.

    Params
        :alpha - hyperparameter for additive smoothing
    """
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def __repr__(self):
        return 'NaiveBayesClassifier(alpha=%.1e)' % (self.alpha)

    def fit(self, X, y):
        """
        Trains Naive Bayes Classifier on data X with labels y

        Rows whose label is neither 0 nor 1 are ignored.

        Input
            :X - numpy.array with shape (num_points, num_features)
            :y - numpy.array with shape (num_points, )

        Sets attributes
            :pos_prior - estimate of prior probability of label 1
            :neg_prior - estimate of prior probability of label 0
        """
        X = np.asarray(X)
        y = np.asarray(y)
        pos_mask = (y == 1)
        neg_mask = (y == 0)
        # One value->count dict per feature column, separately for each class
        self._pos_counts = [extract_counts(L) for L in X[pos_mask].T]
        self._neg_counts = [extract_counts(L) for L in X[neg_mask].T]
        self._total_pos = float(np.sum(pos_mask))
        self._total_neg = float(np.sum(neg_mask))
        total = self._total_pos + self._total_neg
        self.pos_prior = self._total_pos / total
        self.neg_prior = self._total_neg / total

    def _smoothed_log_likelihoods(self, X):
        """
        Returns the pair (log P(x_ij | c=1), log P(x_ij | c=0)), each a
        numpy.array with shape (num_points, num_features)

        Each probability is (count + alpha) / (class_total + alpha * k) where
        k is the number of distinct values seen for that feature within that
        class during fit. Values never seen in a class have count 0.

        Raises
            ValueError if X does not have the number of features seen in fit
        """
        X = np.asarray(X)
        m, n = X.shape
        if n != len(self._pos_counts):
            raise ValueError(
                'Dimension mismatch: expected %i features, got %i' % (
                    len(self._pos_counts), n))
        alpha = self.alpha
        # Denominators depend only on the feature, so compute them once
        pos_denoms = np.array([self._total_pos + alpha * len(counts)
                               for counts in self._pos_counts], dtype=float)
        neg_denoms = np.array([self._total_neg + alpha * len(counts)
                               for counts in self._neg_counts], dtype=float)
        pos_hits = np.zeros((m, n))
        neg_hits = np.zeros((m, n))
        for j, column in enumerate(X.T):
            pos_lookup = self._pos_counts[j]
            neg_lookup = self._neg_counts[j]
            pos_hits[:, j] = [pos_lookup.get(v, 0) for v in column]
            neg_hits[:, j] = [neg_lookup.get(v, 0) for v in column]
        log_pos = np.log((pos_hits + alpha) / pos_denoms)
        log_neg = np.log((neg_hits + alpha) / neg_denoms)
        return log_pos, log_neg

    def log_transform(self, X):
        """
        Returns the per-feature log likelihood ratios
        log (P(x_i | c=1) / P(x_i | c=0)) using additive smoothing, as a
        numpy.array with shape (num_points, num_features)

        The class priors are not included; see log_predict for the combined
        score.

        Input
            :X - numpy.array with shape (num_points, num_features)
                 num_features must be the same as data used to fit model

        Raises
            ValueError if num_features differs from the fitted data
        """
        log_pos, log_neg = self._smoothed_log_likelihoods(X)
        return log_pos - log_neg

    def log_predict(self, X):
        """
        Returns log ((P(c=1) / P(c=0)) * prod_i P(x_i | c=1) / P(x_i | c=0))
        using additive smoothing, as a numpy.array with shape (num_points, )

        Input
            :X - numpy.array with shape (num_points, num_features)
                 num_features must be the same as data used to fit model

        Raises
            ValueError if num_features differs from the fitted data
        """
        log_pos, log_neg = self._smoothed_log_likelihoods(X)
        # Log prior ratio plus the summed per-feature log likelihoods
        return (np.log(self.pos_prior) + np.sum(log_pos, axis=1)
                - np.log(self.neg_prior) - np.sum(log_neg, axis=1))

    def predict(self, X, cutoff=0):
        """
        Returns predicted binary classes for data with decision boundary
        given by cutoff: 1 where the log prediction is >= cutoff, else 0

        Input
            :X - see NaiveBayesClassifier.log_predict
            :cutoff - decision boundary for log predictions
        """
        preds = self.log_predict(X)
        return (preds >= cutoff).astype(int)

def main(train_file='train.csv', test_file='test.csv', output_file='nb_predict.csv'):
    """
    Load the train/test CSV files, transform the features and build a
    NaiveBayesClassifier

    NOTE: the fit / predict / write-submission calls below are commented
    out, so the returned model is untrained and output_file is only
    mentioned in a progress message, never written.

    Input
        :train_file - CSV with an ACTION column holding the labels
        :test_file - CSV with the test rows
        :output_file - path intended for the submission file

    Returns the tuple (model, X, y, X_test)
    """
    # Load data
    print('Loading data...')
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)
    y = np.array(train_data.ACTION)
    # Positional slice 1:-1 drops the first and the last column; .iloc is
    # used because DataFrame.ix no longer exists in current pandas.
    X = np.array(train_data.iloc[:, 1:-1])      # Ignores ACTION, ROLE_CODE
    X_test = np.array(test_data.iloc[:, 1:-1])  # Ignores ID, ROLE_CODE

    # Transform features with group_data (from data_utils) using degree=2
    # NOTE(review): presumably builds pairwise feature combinations - confirm
    print('Transforming data...')
    X = group_data(X, degree=2)
    X_test = group_data(X_test, degree=2)
    model = NaiveBayesClassifier(alpha=1e-10)

    # Train model (currently disabled)
    print('Training Naive Bayes Classifier...')
    #~ model.fit(X, y)

    # Make prediction on test set (currently disabled)
    print('Predicting on test set...')
    #~ preds = model.log_predict(X_test)

    # Write the submission file (currently disabled)
    print('Writing predictions to %s...' % (output_file))
    #~ create_test_submission(output_file, preds)

    return model, X, y, X_test

if __name__ == '__main__':
    # Script entry point: run main with the default file names and keep the
    # results bound to module-level names.
    args = dict(train_file='train.csv',
                test_file='test.csv',
                output_file='nb_predict.csv')
    model, X, y, X_test = main(**args)