• API
• FAQ
• Tools
• Archive
SHARE
TWEET

# Untitled

a guest Sep 24th, 2019 87 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
class MyNaiveBayes:
    """Categorical Naive Bayes classifier built on pandas.

    ``fit`` learns class priors and per-class feature-value likelihoods from a
    DataFrame of categorical features; ``predict`` returns, for each row, the
    label with the highest unnormalized posterior (prior * product of
    likelihoods).
    """

    def fit(self, data, labels):
        # use this method to learn the model
        # if you feel it is easier to calculate priors and likelihoods at the same time
        # then feel free to change this method
        #
        # data   : pd.DataFrame of categorical features, one row per sample
        # labels : pd.Series of class labels aligned with `data`'s index
        self.occs = {}  # unused internally; kept for backward compatibility
        self.priors = self.calculate_priors(labels)
        self.likelihoods = self.calculate_likelihoods(data, labels)
        # Fallback probability for a feature value never observed with a
        # given label: small, so unseen values penalize a class instead of
        # rewarding it (the old code substituted the *best* likelihood).
        self._fallback = 1.0 / (len(labels) + 1)

    def predict(self, data):
        # recall: posterior is P(label_i|feature_j)
        # hint: Posterior probability is a matrix of size m*n (m samples and n labels)
        #       our prediction for each instance in data is the class that
        #       has the highest posterior probability.
        #       You do not need to normalize your posterior,
        #       meaning that for classification, prior and likelihood are enough
        #       and there is no need to divide by evidence. Think why!
        # return: a list of class labels (predicted)
        ##### YOUR PREDICTION CODE STARTS ##### (please do not delete this line)
        # The evidence P(features) is constant across labels for a given row,
        # so dividing by it cannot change the argmax — we skip it.
        labels = list(self.priors.index)
        feature_names = list(data.columns)
        likelihoods = self.likelihoods
        fallback = getattr(self, "_fallback", 1e-9)
        outputs = []
        for _, row in data.iterrows():
            # Start each class at its prior, then multiply in the likelihood
            # of every observed feature value.
            outcomes = {label: self.priors[label] for label in labels}
            for label in labels:
                tables = likelihoods[label]
                for feature_name in feature_names:
                    # Likelihood tables are keyed by str() of the value
                    # (matching calculate_likelihoods). Unseen values get
                    # the small fallback probability instead of a KeyError.
                    value = str(row[feature_name])
                    outcomes[label] *= tables[feature_name].get(value, fallback)
            outputs.append(max(outcomes, key=outcomes.get))
        prediction = pd.Series(outputs)
        ##### YOUR PREDICTION CODE ENDS ##### (please do not delete this line)

        return prediction

    def calculate_priors(self, train_labels):
        # recall: prior is P(label=l_i)
        # hint: store priors in a pandas Series or a list
        ##### YOUR PRIORS CODE STARTS ##### (please do not delete this line)
        # P(label) = count(label) / total samples, as a Series indexed by label.
        n = len(train_labels)
        priors = train_labels.value_counts()
        ##### YOUR PRIORS CODE ENDS ##### (please do not delete this line)
        return priors.divide(n)

    def calculate_likelihoods(self, train_data, train_labels):
        # recall: likelihood is P(feature=f_j|label=l_i)
        # Returns a nested dict: {label: {feature: {str(value): P(value|label)}}}.
        #
        # Each conditional probability is count(feature=value, label) divided
        # by count(label) — a proper P(value | label), not the joint
        # P(value, label) (dividing by the total n would multiply an extra
        # factor of P(label) into the posterior per feature).
        ##### YOUR LIKELIHOODS CODE STARTS ##### (please do not delete this line)
        feature_names = list(train_data.columns)
        class_counts = train_labels.value_counts()
        likelihoods = {label: {col: {} for col in feature_names}
                       for label in train_labels.unique()}
        for i, row in train_data.iterrows():
            label = train_labels[i]
            # Each sample contributes 1/count(label) toward P(value | label).
            weight = 1.0 / class_counts[label]
            for feature_name in feature_names:
                # Keys are str() of the value so numeric and string features
                # hash consistently between training and prediction.
                value = str(row[feature_name])
                table = likelihoods[label][feature_name]
                table[value] = table.get(value, 0.0) + weight
        ##### YOUR LIKELIHOODS CODE ENDS ##### (please do not delete this line)
        return likelihoods
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy.

Top