Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import util
- import math
class NaiveBayesClassifier(object):
    """
    See the project description for the specifications of the Naive Bayes classifier.

    Note that the variable 'datum' in this code refers to a counter of features
    (not to a raw samples.Datum).
    """

    def __init__(self, legalLabels, smoothing=0, logTransform=False, featureValues=None):
        """
        Args:
            legalLabels: list of every label the classifier may predict.
            smoothing: Laplace smoothing constant k (0 disables smoothing).
            logTransform: if True, classification uses log-probabilities,
                which avoids floating-point underflow with many features.
            featureValues: mapping feature -> collection of that feature's
                possible values; needed when smoothing > 0, may be omitted
                otherwise.  Defaults to an empty util.Counter.
        """
        self.legalLabels = legalLabels
        self.type = "naivebayes"
        self.k = int(smoothing)  # Laplace smoothing parameter, used in fit()
        self.logTransform = logTransform
        # The original signature used a mutable default (util.Counter()),
        # which is shared across all instances; create a fresh one instead.
        self.featureValues = featureValues if featureValues is not None else util.Counter()

    def fit(self, trainingData, trainingLabels):
        """
        Train the classifier by collecting counts over the training data and
        storing Laplace-smoothed probability estimates for later classification.

        Args:
            trainingData: list of feature counters (one per instance).
            trainingLabels: list with the correct label for each instance.

        Side effects: sets self.features, self.prior (P(label)) and
        self.conditionalProb (P(feature = value | label), keyed by the tuple
        (label, feature, value)).
        """
        # Feature names, taken from the first instance; list() so this also
        # works under Python 3, where keys() is a view.
        self.features = list(trainingData[0].keys())
        self.prior = util.Counter()            # raw label counts, normalized at the end
        self.conditionalProb = util.Counter()  # keyed (label, feature, value)

        datasetLength = len(trainingLabels)

        # Count how often each label occurs.
        for label in trainingLabels:
            self.prior[label] += 1.0

        # Count every observed (label, feature, value) combination.
        for datum, label in zip(trainingData, trainingLabels):
            for feat in self.features:
                self.conditionalProb[(label, feat, datum[feat])] += 1.0

        if self.k > 0:
            # Laplace smoothing must visit EVERY label x feature x value
            # combination, including ones never seen in training: an unseen
            # combination has raw count 0 and would otherwise keep probability
            # 0, which breaks calculateLogJointProbabilities (log of 0).
            for label in self.legalLabels:
                labelCount = self.prior[label]
                for feat in self.features:
                    possibleValues = self.featureValues[feat]
                    denom = labelCount + len(possibleValues) * self.k
                    for value in possibleValues:
                        numer = self.conditionalProb[(label, feat, value)] + self.k
                        self.conditionalProb[(label, feat, value)] = numer / float(denom)
        else:
            # No smoothing: just normalize the observed counts by the raw
            # count of the corresponding label.
            for key in list(self.conditionalProb.keys()):
                self.conditionalProb[key] /= float(self.prior[key[0]])

        # Turn label counts into probabilities.  This must happen AFTER the
        # conditional probabilities are computed, because those divide by the
        # raw counts still stored in self.prior.
        for label in self.legalLabels:
            self.prior[label] = self.prior[label] / float(datasetLength)

    def predict(self, testData):
        """
        Classify the data based on the posterior distribution over labels.
        You shouldn't modify this method.
        """
        guesses = []
        self.posteriors = []  # posterior probabilities are stored for later data analysis
        for instance in testData:
            if self.logTransform:
                posterior = self.calculateLogJointProbabilities(instance)
            else:
                posterior = self.calculateJointProbabilities(instance)
            guesses.append(posterior.argMax())
            self.posteriors.append(posterior)
        return guesses

    def calculateJointProbabilities(self, instance):
        """
        Returns the joint distribution over legal labels and the instance.
        Each probability should be stored in the joint counter, e.g.
        Joint[3] = <Estimate of ( P(Label = 3, instance) )>

        To get the list of all possible features or labels, use self.features
        and self.legalLabels.
        """
        joint = util.Counter()
        for label in self.legalLabels:
            # P(label, instance) = P(label) * prod_f P(f = instance[f] | label).
            joint[label] = self.prior[label]
            for feature in self.features:
                # Unseen (label, feature, value) combinations yield 0 from the
                # Counter, zeroing the whole product unless smoothing was used.
                joint[label] *= self.conditionalProb[(label, feature, instance[feature])]
        return joint

    def calculateLogJointProbabilities(self, instance):
        """
        Returns the log-joint distribution over legal labels and the instance.
        Each log-probability should be stored in the log-joint counter, e.g.
        logJoint[3] = <Estimate of log( P(Label = 3, instance) )>

        To get the list of all possible features or labels, use self.features
        and self.legalLabels.

        NOTE: requires every involved probability to be strictly positive
        (train with smoothing > 0), otherwise math.log raises ValueError.
        """
        logJoint = util.Counter()
        for label in self.legalLabels:
            # log P(label, instance) = log P(label) + sum_f log P(f = v | label).
            logJoint[label] = math.log(self.prior[label])
            for feature in self.features:
                pVal = self.conditionalProb[(label, feature, instance[feature])]
                logJoint[label] += math.log(pVal)
        return logJoint
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement