Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import util
- import math
class NaiveBayesClassifier(object):
    """
    See the project description for the specifications of the Naive Bayes classifier.

    Note that the variable 'datum' in this code refers to a counter of features
    (not to a raw samples.Datum).
    """

    def __init__(self, legalLabels, smoothing=0, logTransform=False, featureValues=None):
        """
        Args:
            legalLabels: list of every label the classifier may predict.
            smoothing: Laplace smoothing constant k (0 disables smoothing).
            logTransform: if True, classification uses log-probabilities,
                which avoids floating-point underflow with many features.
            featureValues: mapping feature -> collection of that feature's
                possible values; needed when smoothing > 0, may be omitted
                otherwise.  Defaults to an empty util.Counter.
        """
        self.legalLabels = legalLabels
        self.type = "naivebayes"
        self.k = int(smoothing)  # Laplace smoothing parameter, used in fit()
        self.logTransform = logTransform
        # The original signature used a mutable default (util.Counter()),
        # which is shared across all instances; create a fresh one instead.
        self.featureValues = featureValues if featureValues is not None else util.Counter()

    def fit(self, trainingData, trainingLabels):
        """
        Train the classifier by collecting counts over the training data and
        storing Laplace-smoothed probability estimates for later classification.

        Args:
            trainingData: list of feature counters (one per instance).
            trainingLabels: list with the correct label for each instance.

        Side effects: sets self.features, self.prior (P(label)) and
        self.conditionalProb (P(feature = value | label), keyed by the tuple
        (label, feature, value)).
        """
        # Feature names, taken from the first instance; list() so this also
        # works under Python 3, where keys() is a view.
        self.features = list(trainingData[0].keys())
        self.prior = util.Counter()            # raw label counts, normalized at the end
        self.conditionalProb = util.Counter()  # keyed (label, feature, value)

        datasetLength = len(trainingLabels)

        # Count how often each label occurs.
        for label in trainingLabels:
            self.prior[label] += 1.0

        # Count every observed (label, feature, value) combination.
        for datum, label in zip(trainingData, trainingLabels):
            for feat in self.features:
                self.conditionalProb[(label, feat, datum[feat])] += 1.0

        if self.k > 0:
            # Laplace smoothing must visit EVERY label x feature x value
            # combination, including ones never seen in training: an unseen
            # combination has raw count 0 and would otherwise keep probability
            # 0, which breaks calculateLogJointProbabilities (log of 0).
            for label in self.legalLabels:
                labelCount = self.prior[label]
                for feat in self.features:
                    possibleValues = self.featureValues[feat]
                    denom = labelCount + len(possibleValues) * self.k
                    for value in possibleValues:
                        numer = self.conditionalProb[(label, feat, value)] + self.k
                        self.conditionalProb[(label, feat, value)] = numer / float(denom)
        else:
            # No smoothing: just normalize the observed counts by the raw
            # count of the corresponding label.
            for key in list(self.conditionalProb.keys()):
                self.conditionalProb[key] /= float(self.prior[key[0]])

        # Turn label counts into probabilities.  This must happen AFTER the
        # conditional probabilities are computed, because those divide by the
        # raw counts still stored in self.prior.
        for label in self.legalLabels:
            self.prior[label] = self.prior[label] / float(datasetLength)

    def predict(self, testData):
        """
        Classify the data based on the posterior distribution over labels.
        You shouldn't modify this method.
        """
        guesses = []
        self.posteriors = []  # posterior probabilities are stored for later data analysis
        for instance in testData:
            if self.logTransform:
                posterior = self.calculateLogJointProbabilities(instance)
            else:
                posterior = self.calculateJointProbabilities(instance)
            guesses.append(posterior.argMax())
            self.posteriors.append(posterior)
        return guesses

    def calculateJointProbabilities(self, instance):
        """
        Returns the joint distribution over legal labels and the instance.
        Each probability should be stored in the joint counter, e.g.
        Joint[3] = <Estimate of ( P(Label = 3, instance) )>

        To get the list of all possible features or labels, use self.features
        and self.legalLabels.
        """
        joint = util.Counter()
        for label in self.legalLabels:
            # P(label, instance) = P(label) * prod_f P(f = instance[f] | label).
            joint[label] = self.prior[label]
            for feature in self.features:
                # Unseen (label, feature, value) combinations yield 0 from the
                # Counter, zeroing the whole product unless smoothing was used.
                joint[label] *= self.conditionalProb[(label, feature, instance[feature])]
        return joint

    def calculateLogJointProbabilities(self, instance):
        """
        Returns the log-joint distribution over legal labels and the instance.
        Each log-probability should be stored in the log-joint counter, e.g.
        logJoint[3] = <Estimate of log( P(Label = 3, instance) )>

        To get the list of all possible features or labels, use self.features
        and self.legalLabels.

        NOTE: requires every involved probability to be strictly positive
        (train with smoothing > 0), otherwise math.log raises ValueError.
        """
        logJoint = util.Counter()
        for label in self.legalLabels:
            # log P(label, instance) = log P(label) + sum_f log P(f = v | label).
            logJoint[label] = math.log(self.prior[label])
            for feature in self.features:
                pVal = self.conditionalProb[(label, feature, instance[feature])]
                logJoint[label] += math.log(pVal)
        return logJoint
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement