import pandas as pd


class MyNaiveBayes:

    def fit(self, data, labels):
        # use this method to learn the model
        # if you feel it is easier to calculate priors and likelihoods at the same time
        # then feel free to change this method
        self.priors = self.calculate_priors(labels)
        self.likelihoods = self.calculate_likelihoods(data, labels)

    def predict(self, data):
        # recall: posterior is P(label_i|features_j), the probability of label i
        #         given the feature values of sample j
        # hint: Posterior probability is a matrix of size m*n (m samples and n labels).
        #       Our prediction for each instance in data is the class that
        #       has the highest posterior probability.
        #       You do not need to normalize your posterior,
        #       meaning that for classification, prior and likelihood are enough
        #       and there is no need to divide by evidence. Think why!
        # return: a list of class labels (predicted)
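        # Worked example (hypothetical numbers): with priors P(yes)=0.6 and
        # P(no)=0.4, suppose the likelihood products for one row come out to
        # 0.6*0.02=0.012 for "yes" and 0.4*0.05=0.020 for "no"; we predict
        # "no". Dividing both scores by the same evidence P(features) cannot
        # change which one is larger, so normalization is unnecessary here.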
        ##### YOUR PREDICTION CODE STARTS ##### (please do not delete this line)
        outputs = []
        failed_calls = 0
        total_calls = 0
        labels = list(self.priors.index)
        feature_names = list(data.columns)
        likelihoods = self.likelihoods
        for i, row in data.iterrows():
            # start each class at its prior, then multiply in the likelihoods
            outcomes = {label: self.priors[label] for label in labels}
            for label in labels:
                for feature_name in feature_names:
                    total_calls += 1
                    # str() to match the keys built in calculate_likelihoods
                    feature_value = str(row[feature_name])
                    try:
                        outcomes[label] *= likelihoods[label][feature_name][feature_value]
                    except KeyError:
                        # value never seen with this label during training; fall back
                        # to the feature's most likely value as a crude substitute
                        failed_calls += 1
                        feature_likelihoods = likelihoods[label][feature_name]
                        best_match = max(feature_likelihoods, key=feature_likelihoods.get)
                        outcomes[label] *= feature_likelihoods[best_match]
            predicted_label = max(outcomes, key=outcomes.get)
            outputs.append(predicted_label)
        prediction = pd.Series(outputs)
        print(f"Failed {failed_calls}/{total_calls} likelihood lookups")
        ##### YOUR PREDICTION CODE ENDS ##### (please do not delete this line)

        return prediction

    def calculate_priors(self, train_labels):
        # recall: prior is P(label=l_i)
        # hint: store priors in a pandas Series or a list
        ##### YOUR PRIORS CODE STARTS ##### (please do not delete this line)
        n = len(train_labels)
        priors = train_labels.value_counts()  # count of each distinct label
        ##### YOUR PRIORS CODE ENDS ##### (please do not delete this line)
        return priors.divide(n)  # normalize counts into probabilities

    def calculate_likelihoods(self, train_data, train_labels):
        # recall: likelihood is P(feature=f_j|label=l_i)
        # hint: store likelihoods in a data structure like a dictionary:
        #        feature_j = [likelihood_k]
        #        likelihoods = {label_i: [feature_j]}
        #       where j implies iteration over features, k implies iteration
        #       over the different values of feature j, and i implies iteration
        #       over the different values of the label. Likelihoods is then a
        #       dictionary that maps each label value to its corresponding
        #       likelihoods with respect to feature values (list of lists).
        #
        #       NB: The above pseudocode is for the purpose of understanding
        #           the logic, but it could also be implemented as it is.
        #           You are free to use any other data structure
        #           or way that is convenient to you!
        #
        #       More coding hints: you are encouraged to use pandas as much as
        #       possible for all these parts, as it comes with flexible and
        #       convenient indexing features that make the task easier.
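        # For instance, with the nested-dict layout used below, likelihoods
        # could look like this (hypothetical values, for illustration only):
        #   {"yes": {"outlook": {"sunny": 0.2, "rainy": 0.5}, ...}, "no": {...}}
        # so that likelihoods[label][feature][value] = P(feature=value|label).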
        ##### YOUR LIKELIHOODS CODE STARTS ##### (please do not delete this line)
        feature_names = list(train_data.columns)
        label_counts = train_labels.value_counts()
        likelihoods = {label: {col: {} for col in feature_names}
                       for label in train_labels.unique()}
        for i, row in train_data.iterrows():
            label = train_labels[i]
            # each matching row adds 1/count(label), so the dict accumulates
            # count(feature=value, label) / count(label) = P(feature=value|label)
            increment = 1 / label_counts[label]
            for feature_name in feature_names:
                feature_value = str(row[feature_name])
                current = likelihoods[label][feature_name]
                current[feature_value] = current.get(feature_value, 0.0) + increment

        ##### YOUR LIKELIHOODS CODE ENDS ##### (please do not delete this line)
        return likelihoods
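
# A minimal usage sketch, assuming pandas DataFrame/Series inputs as the
# methods above expect; the "outlook"/"windy" toy data below is invented
# purely for illustration.
if __name__ == "__main__":
    train = pd.DataFrame({
        "outlook": ["sunny", "rainy", "sunny", "rainy", "sunny"],
        "windy":   ["no",    "yes",   "yes",   "no",    "no"],
    })
    labels = pd.Series(["no", "no", "yes", "yes", "yes"])

    model = MyNaiveBayes()
    model.fit(train, labels)

    test = pd.DataFrame({"outlook": ["sunny", "rainy"],
                         "windy":   ["no",    "yes"]})
    print(model.predict(test))  # a pd.Series with one predicted label per row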