Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class MyNaiveBayes:
    """Categorical Naive Bayes classifier over pandas data.

    Attributes (set by :meth:`fit`):
        priors:      pd.Series mapping label -> P(label).
        likelihoods: nested dict {label: {feature: {value: P(value | label)}}};
                     feature values are stored as strings for type-stable lookups.
    """

    def fit(self, data, labels):
        """Learn class priors and per-class feature likelihoods.

        Args:
            data: pd.DataFrame of categorical training features, one row per sample.
            labels: pd.Series of class labels aligned with ``data``'s index.
        """
        self.priors = self.calculate_priors(labels)
        self.likelihoods = self.calculate_likelihoods(data, labels)

    def predict(self, data):
        """Return the most probable label for each row of ``data`` as a pd.Series.

        The posterior is proportional to prior * product of likelihoods; the
        evidence term is label-independent, so dividing by it cannot change
        the argmax and is skipped.
        """
        outputs = []
        failed_calls = 0
        total_calls = 0
        labels = list(self.priors.index)
        feature_names = list(data.columns)
        likelihoods = self.likelihoods
        for _, row in data.iterrows():
            # Start each class score at its prior, then multiply in the
            # likelihood of every observed feature value.
            outcomes = {label: self.priors[label] for label in labels}
            for label in labels:
                for feature_name in feature_names:
                    total_calls += 1
                    # BUG FIX: likelihood tables are keyed by str(value) in
                    # calculate_likelihoods, so the lookup must stringify too.
                    value = str(row[feature_name])
                    table = likelihoods[label][feature_name]
                    try:
                        outcomes[label] *= table[value]
                    except KeyError:
                        # Value never seen with this label during training:
                        # fall back to this feature's most probable known
                        # value (an optimistic stand-in for smoothing).
                        failed_calls += 1
                        best_match = max(table, key=table.get)
                        outcomes[label] *= table[best_match]
            outputs.append(max(outcomes, key=outcomes.get))
        prediction = pd.Series(outputs)
        print(f"Failed {failed_calls}/{total_calls}")
        return prediction

    def calculate_priors(self, train_labels):
        """Return P(label) for every distinct label as a pd.Series.

        Args:
            train_labels: pd.Series of training labels.
        """
        n = len(train_labels)
        priors = train_labels.value_counts()
        print(priors)
        return priors.divide(n)

    def calculate_likelihoods(self, train_data, train_labels):
        """Return nested dict {label: {feature: {value: P(value | label)}}}.

        BUG FIX: each count is divided by the number of samples carrying that
        label, so every inner table is a proper conditional distribution.
        Dividing by the total sample count (as before) yields the joint
        P(value, label) and multiplies the prior into the posterior once per
        feature, biasing predictions toward frequent classes.
        """
        feature_names = list(train_data.columns)
        # Per-label sample counts used as normalizers for the conditionals.
        label_counts = train_labels.value_counts()
        # BUG FIX: Series.unique() replaces .values.unique(), which raises
        # AttributeError on the ndarray returned by .values.
        likelihoods = {
            label: {name: {} for name in feature_names}
            for label in train_labels.unique()
        }
        for i, row in train_data.iterrows():
            label = train_labels[i]
            weight = 1 / label_counts[label]
            for feature_name in feature_names:
                # Stringify values so training and prediction agree on keys.
                feature_value = str(row[feature_name])
                table = likelihoods[label][feature_name]
                table[feature_value] = table.get(feature_value, 0.0) + weight
        return likelihoods
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement