import pandas as pd


class MyNaiveBayes:

    def fit(self, data, labels):
        # use this method to learn the model
        # if you feel it is easier to calculate priors and likelihoods at the same time
        # then feel free to change this method
        self.priors = self.calculate_priors(labels)
        self.likelihoods = self.calculate_likelihoods(data, labels)

    def predict(self, data):
        # recall: posterior is P(label_i|feature_j)
        # hint: the posterior probability is a matrix of size m*n (m samples, n labels)
        #       our prediction for each instance in data is the class that
        #       has the highest posterior probability.
        #       You do not need to normalize your posterior,
        #       meaning that for classification, prior and likelihood are enough
        #       and there is no need to divide by evidence. Think why!
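        #       (the evidence P(features) is the same for every label, so
        #       dividing by it cannot change which label attains the maximum)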
        # return: a list of class labels (predicted)
        ##### YOUR PREDICTION CODE STARTS ##### (please do not delete this line)
        outputs = []
        failed_calls = 0
        total_calls = 0
        labels = list(self.priors.index)
        feature_names = list(data.columns)
        likelihoods = self.likelihoods
        for _, row in data.iterrows():
            # start each class score from its prior, then multiply in the likelihoods
            outcomes = {label: self.priors[label] for label in labels}
            for label in labels:
                for feature_name in feature_names:
                    total_calls += 1
                    # fit() stores feature values as strings, so look them up as strings too
                    feature_value = str(row[feature_name])
                    try:
                        outcomes[label] *= likelihoods[label][feature_name][feature_value]
                    except KeyError:
                        # crude fallback for values never seen with this label in training:
                        # reuse the likelihood of that feature's most probable value
                        # (Laplace smoothing would be the standard remedy)
                        failed_calls += 1
                        best_match = max(likelihoods[label][feature_name],
                                         key=likelihoods[label][feature_name].get)
                        outcomes[label] *= likelihoods[label][feature_name][best_match]
            predicted_label = max(outcomes, key=outcomes.get)
            outputs.append(predicted_label)
        prediction = pd.Series(outputs, index=data.index)
        print(f"Failed {failed_calls}/{total_calls} likelihood lookups")
        ##### YOUR PREDICTION CODE ENDS ##### (please do not delete this line)

        return prediction

    def calculate_priors(self, train_labels):
        # recall: prior is P(label=l_i)
        # hint: store priors in a pandas Series or a list
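        #       e.g. labels ['yes', 'yes', 'no'] give priors {'yes': 2/3, 'no': 1/3}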
        ##### YOUR PRIORS CODE STARTS ##### (please do not delete this line)
        n = len(train_labels)
        # relative frequency of each label value in the training set
        priors = train_labels.value_counts().divide(n)
        ##### YOUR PRIORS CODE ENDS ##### (please do not delete this line)
        return priors

    def calculate_likelihoods(self, train_data, train_labels):
        # recall: likelihood is P(feature=f_j|label=l_i)
        # hint: store likelihoods in a data structure like a dictionary:
        #        feature_j = [likelihood_k]
        #        likelihoods = {label_i: [feature_j]}
        #       where j iterates over features, k iterates over the different
        #       values of feature j, and i iterates over the different values
        #       of the label. likelihoods is then a dictionary that maps each
        #       label value to its corresponding likelihoods with respect to
        #       feature values (list of lists).
        #
        #       NB: The above pseudocode is for the purpose of understanding
        #           the logic, but it could also be implemented as it is.
        #           You are free to use any other data structure
        #           or way that is convenient to you!
        #
        #       More coding hints: you are encouraged to use pandas as much as
        #       possible for all these parts, as it comes with flexible and
        #       convenient indexing features which make the task easier.
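        #       with the nested-dict layout used below this reads as, e.g.,
        #       likelihoods['yes']['outlook']['sunny'] == P(outlook='sunny' | label='yes')
        #       (the label, feature, and value names are just an example)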
        ##### YOUR LIKELIHOODS CODE STARTS ##### (please do not delete this line)
        feature_names = list(train_data.columns)
        # divide by the per-label count rather than the total sample size, so each
        # entry ends up as count(value, label) / count(label) = P(feature=value | label)
        label_counts = train_labels.value_counts()
        likelihoods = {label: {col: {} for col in feature_names}
                       for label in train_labels.unique()}
        for i, row in train_data.iterrows():
            label = train_labels.loc[i]
            for feature_name in feature_names:
                # store feature values as strings, matching the lookups in predict()
                feature_value = str(row[feature_name])
                if feature_value in likelihoods[label][feature_name]:
                    likelihoods[label][feature_name][feature_value] += 1 / label_counts[label]
                else:
                    likelihoods[label][feature_name][feature_value] = 1 / label_counts[label]

        ##### YOUR LIKELIHOODS CODE ENDS ##### (please do not delete this line)
        return likelihoods
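

# ---------------------------------------------------------------------------
# A minimal usage sketch, not part of the original paste: it assumes a tiny
# hand-made categorical dataset, and the column names, feature values, and
# labels below are invented purely for illustration.
if __name__ == "__main__":
    train_data = pd.DataFrame({
        "outlook": ["sunny", "sunny", "rainy", "rainy", "overcast", "overcast"],
        "windy":   ["no",    "yes",   "no",    "yes",   "no",       "yes"],
    })
    train_labels = pd.Series(["play", "play", "stay", "stay", "play", "stay"])

    model = MyNaiveBayes()
    model.fit(train_data, train_labels)

    test_data = pd.DataFrame({
        "outlook": ["sunny", "rainy"],
        "windy":   ["no",    "yes"],
    })
    # prints one predicted label per test row
    print(model.predict(test_data))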