class MyNaiveBayes:
    """Categorical Naive Bayes classifier operating on pandas objects.

    Every feature is treated as categorical: values are converted with
    ``str`` both when fitting and when predicting, so the same value always
    maps to the same key regardless of its original dtype.

    After ``fit`` the instance exposes:

    * ``priors`` -- pandas Series mapping each label to P(label).
    * ``likelihoods`` -- nested dict
      ``{label: {feature_name: {str(value): P(value | label)}}}``.

    ``predict`` builds its result with ``pd.Series``, so the name ``pd``
    (pandas) must be available in the enclosing module.
    """

    # Additive (Laplace) smoothing constant used when estimating likelihoods.
    # It keeps a value that was never observed together with some label from
    # receiving probability zero for that label.
    smoothing = 1.0

    def fit(self, data, labels):
        """Learn priors and per-feature likelihoods from the training set.

        Args:
            data: DataFrame of training features, one row per sample.
            labels: Series of class labels sharing ``data``'s index.
        """
        # Kept for backward compatibility; nothing in this class reads it.
        self.occs = {}
        self.priors = self.calculate_priors(labels)
        self.likelihoods = self.calculate_likelihoods(data, labels)

    def predict(self, data):
        """Return the most probable label for every row of ``data``.

        Each label is scored with ``log P(label) + sum_j log P(value_j | label)``.
        The evidence term is omitted because it is identical for every label
        and therefore cannot change which label scores highest.  Working in
        log space avoids floating-point underflow when there are many features.

        A value that has no entry in a label's likelihood table (it was never
        seen for that feature during training) contributes nothing to that
        label's score; such lookups are counted and reported in the printed
        ``Failed`` summary.

        Args:
            data: DataFrame with the feature columns to classify.

        Returns:
            pandas Series of predicted labels in row order, with a default
            integer index.
        """
        ##### YOUR PREDICTION CODE STARTS ##### (please do not delete this line)
        # Function-scope import: this class cannot rely on the module having
        # imported ``math`` already.
        import math

        def log_prob(probability):
            # log(0) is undefined; -inf makes an impossible event lose every
            # comparison without raising.
            return math.log(probability) if probability > 0 else float("-inf")

        feature_names = list(data.columns)
        log_priors = {
            label: log_prob(prior) for label, prior in self.priors.items()
        }
        # One log-likelihood table per (label, feature), computed once instead
        # of once per row.  Labels or features missing from the fitted
        # likelihoods get an empty table and are simply uninformative.
        log_tables = {
            label: [
                {
                    value: log_prob(probability)
                    for value, probability in self.likelihoods.get(label, {})
                    .get(name, {})
                    .items()
                }
                for name in feature_names
            ]
            for label in log_priors
        }
        # Likelihood keys are str(value), so look values up the same way.
        keyed_columns = [data[name].map(str).tolist() for name in feature_names]

        outputs = []
        failed_calls = 0
        total_calls = 0
        for position in range(len(data)):
            row_values = [column[position] for column in keyed_columns]
            scores = dict(log_priors)
            for label, tables in log_tables.items():
                for value, table in zip(row_values, tables):
                    total_calls += 1
                    if value in table:
                        scores[label] += table[value]
                    else:
                        failed_calls += 1
            outputs.append(max(scores, key=scores.get))
        prediction = pd.Series(outputs)
        print(f"Failed  {failed_calls}/{total_calls}")
        ##### YOUR PREDICTION CODE ENDS ##### (please do not delete this line)

        return prediction

    def calculate_priors(self, train_labels):
        """Estimate P(label) as the relative frequency of each label.

        The raw label counts are printed before normalisation.

        Args:
            train_labels: Series of training labels.

        Returns:
            pandas Series indexed by label whose values sum to 1 (empty when
            there are no labels).  Missing labels are ignored.
        """
        ##### YOUR PRIORS CODE STARTS ##### (please do not delete this line)
        priors = train_labels.value_counts()
        ##### YOUR PRIORS CODE ENDS ##### (please do not delete this line)
        print(priors)
        # Normalise by the counted total rather than len(train_labels):
        # value_counts() drops missing labels, so dividing by the full length
        # would leave the priors summing to less than 1.
        return priors.divide(priors.sum())

    def calculate_likelihoods(self, train_data, train_labels):
        """Estimate P(feature value | label) for every feature and label.

        Estimates use additive smoothing with ``self.smoothing``::

            P(v | label) = (count(v, label) + a) / (count(label) + a * V)

        where ``V`` is the number of distinct values of that feature in the
        whole training set.  Every value seen anywhere in training therefore
        has an entry for every label.

        Args:
            train_data: DataFrame of training features.
            train_labels: Series of labels sharing ``train_data``'s index.

        Returns:
            Nested dict ``{label: {feature_name: {str(value): probability}}}``.
        """
        ##### YOUR LIKELIHOODS CODE STARTS ##### (please do not delete this line)
        alpha = self.smoothing
        feature_names = list(train_data.columns)
        # Stringify each column once; predict() keys its lookups the same way.
        keyed = {name: train_data[name].map(str) for name in feature_names}
        vocabulary = {name: keyed[name].unique().tolist() for name in feature_names}

        likelihoods = {}
        for label in train_labels.dropna().unique().tolist():
            in_class = train_labels == label
            per_feature = {}
            for name in feature_names:
                class_values = keyed[name].loc[in_class]
                counts = class_values.value_counts()
                denominator = len(class_values) + alpha * len(vocabulary[name])
                per_feature[name] = {
                    value: float((counts.get(value, 0) + alpha) / denominator)
                    for value in vocabulary[name]
                }
            likelihoods[label] = per_feature

        ##### YOUR LIKELIHOODS CODE ENDS ##### (please do not delete this line)
        return likelihoods
