Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class MyNaiveBayes:
    """Categorical Naive Bayes classifier over pandas data.

    Attributes (set by :meth:`fit`):
        priors:      pd.Series mapping label -> P(label).
        likelihoods: nested dict {label: {feature: {value: P(value | label)}}};
                     feature values are stored as strings for type-stable lookups.
    """

    def fit(self, data, labels):
        """Learn class priors and per-class feature likelihoods.

        Args:
            data: pd.DataFrame of categorical training features, one row per sample.
            labels: pd.Series of class labels aligned with ``data``'s index.
        """
        self.priors = self.calculate_priors(labels)
        self.likelihoods = self.calculate_likelihoods(data, labels)

    def predict(self, data):
        """Return the most probable label for each row of ``data`` as a pd.Series.

        The posterior is proportional to prior * product of likelihoods; the
        evidence term is label-independent, so dividing by it cannot change
        the argmax and is skipped.
        """
        outputs = []
        failed_calls = 0
        total_calls = 0
        labels = list(self.priors.index)
        feature_names = list(data.columns)
        likelihoods = self.likelihoods
        for _, row in data.iterrows():
            # Start each class score at its prior, then multiply in the
            # likelihood of every observed feature value.
            outcomes = {label: self.priors[label] for label in labels}
            for label in labels:
                for feature_name in feature_names:
                    total_calls += 1
                    # BUG FIX: likelihood tables are keyed by str(value) in
                    # calculate_likelihoods, so the lookup must stringify too.
                    value = str(row[feature_name])
                    table = likelihoods[label][feature_name]
                    try:
                        outcomes[label] *= table[value]
                    except KeyError:
                        # Value never seen with this label during training:
                        # fall back to this feature's most probable known
                        # value (an optimistic stand-in for smoothing).
                        failed_calls += 1
                        best_match = max(table, key=table.get)
                        outcomes[label] *= table[best_match]
            outputs.append(max(outcomes, key=outcomes.get))
        prediction = pd.Series(outputs)
        print(f"Failed {failed_calls}/{total_calls}")
        return prediction

    def calculate_priors(self, train_labels):
        """Return P(label) for every distinct label as a pd.Series.

        Args:
            train_labels: pd.Series of training labels.
        """
        n = len(train_labels)
        priors = train_labels.value_counts()
        print(priors)
        return priors.divide(n)

    def calculate_likelihoods(self, train_data, train_labels):
        """Return nested dict {label: {feature: {value: P(value | label)}}}.

        BUG FIX: each count is divided by the number of samples carrying that
        label, so every inner table is a proper conditional distribution.
        Dividing by the total sample count (as before) yields the joint
        P(value, label) and multiplies the prior into the posterior once per
        feature, biasing predictions toward frequent classes.
        """
        feature_names = list(train_data.columns)
        # Per-label sample counts used as normalizers for the conditionals.
        label_counts = train_labels.value_counts()
        # BUG FIX: Series.unique() replaces .values.unique(), which raises
        # AttributeError on the ndarray returned by .values.
        likelihoods = {
            label: {name: {} for name in feature_names}
            for label in train_labels.unique()
        }
        for i, row in train_data.iterrows():
            label = train_labels[i]
            weight = 1 / label_counts[label]
            for feature_name in feature_names:
                # Stringify values so training and prediction agree on keys.
                feature_value = str(row[feature_name])
                table = likelihoods[label][feature_name]
                table[feature_value] = table.get(feature_value, 0.0) + weight
        return likelihoods
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement