"""# Q1. Use object-oriented programming to write a Python script for the Decision Tree Classification algorithm without using any existing standard class. Create your own class and functions which can perform the following functionality:
• Function to calculate the entropy
• Function to calculate the Information gain
• Function to find the maximum Information gain (see the helper sketch after the per-feature gain calculations below)
"""
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

dataset = pd.read_csv('Iris.csv')
dataset
# Commented out IPython magic to ensure Python compatibility.
# %%latex
#
# Entropy = $-\sum_{i=1}^{n} P_i \times \log_b(P_i)$
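# For reference, the Information gain computed for each feature below is the entire-dataset
# entropy minus the weighted entropy of the subsets created by splitting on attribute A:
#
# Information Gain = $H(S) - \sum_{v \in Values(A)} \frac{|S_v|}{|S|} \times H(S_v)$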
def entropy_calculate(prob_list):
    entropy = 0
    for item in prob_list:
        entropy -= item * np.log2(item)
    return entropy
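# Quick sanity check of entropy_calculate (illustrative, not part of the assignment data):
# a fair 50/50 split should give exactly 1 bit of entropy.
print('Entropy of a fair coin: %.3f bits' % entropy_calculate([0.5, 0.5]))  # expected 1.000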
cases, counts = np.unique(dataset.Species, return_counts=True)
P = [count/len(dataset) for count in counts]
for case, prob in zip(cases, P):
    print('Probability of %s is %.3f' % (case, prob))

entropy_entire = entropy_calculate(P)
print('Entire system entropy is %.3f bits' % entropy_entire)
cases_SepalLengthCm, counts_SepalLengthCm = np.unique(dataset.SepalLengthCm, return_counts=True)
P_SepalLengthCm = [count/len(dataset) for count in counts_SepalLengthCm]
print('For SepalLengthCm:')
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalLengthCm = {}
total_entropy_SepalLengthCm = 0
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalLengthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.SepalLengthCm == case]) for count in counts]
    entropy_SepalLengthCm[case] = entropy_calculate(P)
    total_entropy_SepalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))

print('\nEntropy at SepalLengthCm decision level is %.3f' % total_entropy_SepalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalLengthCm))
cases_SepalWidthCm, counts_SepalWidthCm = np.unique(dataset.SepalWidthCm, return_counts=True)
P_SepalWidthCm = [count/len(dataset) for count in counts_SepalWidthCm]
print('For SepalWidthCm:')
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalWidthCm = {}
total_entropy_SepalWidthCm = 0
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalWidthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.SepalWidthCm == case]) for count in counts]
    entropy_SepalWidthCm[case] = entropy_calculate(P)
    total_entropy_SepalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))

print('\nEntropy at SepalWidthCm decision level is %.3f' % total_entropy_SepalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalWidthCm))
cases_PetalLengthCm, counts_PetalLengthCm = np.unique(dataset.PetalLengthCm, return_counts=True)
P_PetalLengthCm = [count/len(dataset) for count in counts_PetalLengthCm]
print('For PetalLengthCm:')
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalLengthCm = {}
total_entropy_PetalLengthCm = 0
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalLengthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.PetalLengthCm == case]) for count in counts]
    entropy_PetalLengthCm[case] = entropy_calculate(P)
    total_entropy_PetalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))

print('\nEntropy at PetalLengthCm decision level is %.3f' % total_entropy_PetalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalLengthCm))
cases_PetalWidthCm, counts_PetalWidthCm = np.unique(dataset.PetalWidthCm, return_counts=True)
P_PetalWidthCm = [count/len(dataset) for count in counts_PetalWidthCm]
print('For PetalWidthCm:')
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalWidthCm = {}
total_entropy_PetalWidthCm = 0
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalWidthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.PetalWidthCm == case]) for count in counts]
    entropy_PetalWidthCm[case] = entropy_calculate(P)
    total_entropy_PetalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))

print('\nEntropy at PetalWidthCm decision level is %.3f' % total_entropy_PetalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalWidthCm))
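# The question also asks for a function that finds the maximum Information gain. The repeated
# per-feature calculation above can be wrapped as a helper; this is a minimal sketch, and the
# names information_gain and best_split_feature are illustrative rather than from the original script.
def information_gain(df, feature, target='Species'):
    """Information gain of splitting df on `feature` with respect to `target`."""
    _, counts = np.unique(df[target], return_counts=True)
    entropy_total = entropy_calculate([c/len(df) for c in counts])
    weighted_entropy = 0
    for value, count in zip(*np.unique(df[feature], return_counts=True)):
        subset = df[df[feature] == value]
        _, sub_counts = np.unique(subset[target], return_counts=True)
        weighted_entropy += (count/len(df)) * entropy_calculate([c/len(subset) for c in sub_counts])
    return entropy_total - weighted_entropy

def best_split_feature(df, features, target='Species'):
    """Return the feature with the maximum Information gain, plus all gains."""
    gains = {feature: information_gain(df, feature, target) for feature in features}
    return max(gains, key=gains.get), gains

best_feature, all_gains = best_split_feature(
    dataset, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'])
print('Feature with maximum Information gain: %s' % best_feature)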
# Training
training_data = dataset[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']]
training_data.head()

category_map = {}
for column in ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
training_data.head()

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))
from sklearn.model_selection import train_test_split

X = training_data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = training_data[['Species']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))
from sklearn.tree import DecisionTreeClassifier

X = X_train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = y_train[['Species']]
clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='entropy')
clf = clf.fit(X, y)
clf
clf.feature_importances_

from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)
print(y_pred)
print('Accuracy:', accuracy_score(y_test, y_pred))
- """# Q.2 Write Python script to implement Random Forest Classifier. Use standard Class of Python to implement the algorithm by choosing your own dataset.
- Data Set Characteristics:
- Number of Instances
- 1797
- Number of Attributes
- 64
- Attribute Information
- 8x8 image of integer pixels in the range 0..16.
- Missing Attribute Values
- None
- The data set contains images of hand-written digits: 10 classes where each class refers to a digit.
- """
# using standard classes of Python
from collections import Counter
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    idxs = np.random.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]

def most_common_label(y):
    counter = Counter(y)
    most_common = counter.most_common(1)[0][0]
    return most_common
class RandomForest:
    def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, n_feats=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(min_samples_split=self.min_samples_split,
                                          max_depth=self.max_depth,
                                          max_features=self.n_feats)
            X_samp, y_samp = bootstrap_sample(X, y)
            tree.fit(X_samp, y_samp)
            self.trees.append(tree)

    def predict(self, X):
        # collect each tree's predictions, then majority-vote per sample
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(tree_preds, 0, 1)
        y_pred = [most_common_label(tree_pred) for tree_pred in tree_preds]
        return np.array(y_pred)
# Importing relevant libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split

# function for accuracy
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

# loading dataset
data = datasets.load_digits()
X = data.data
y = data.target

# splitting dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

clf = RandomForest(n_trees=3, max_depth=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy(y_test, y_pred)
print("Accuracy:", acc)
- """# Q3. Write Python script to implement K-Nearest Neighbours. Use standard Class of Python to implement the algorithm by choosing your own dataset."""
# Python algorithm for KNN using standard classes of Python
from collections import Counter
import numpy as np

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Sort by distance and return indices of the first k neighbors
        k_idx = np.argsort(distances)[: self.k]
        # Extract the labels of the k nearest neighbor training samples
        k_neighbor_labels = [self.y_train[i] for i in k_idx]
        # return the most common class label
        most_common = Counter(k_neighbor_labels).most_common(1)
        return most_common[0][0]
# Importing relevant libraries
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.model_selection import train_test_split

cmap = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

# function to check accuracy
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

# loading dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# splitting data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

# checking accuracy
k = 3
clf = KNN(k=k)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print("KNN classification accuracy", accuracy(y_test, predictions))
- """# Q4. Write Python script to implement Naïve Bayes Classifier. Use standard Class of Python to implement the algorithm by choosing your own dataset.
- """
import numpy as np

class NaiveBayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)
        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)
        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []
        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)
        # return class with highest posterior probability
        return self._classes[np.argmax(posteriors)]

    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator
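# For reference, _pdf above is the per-feature Gaussian likelihood, and _predict picks the class
# that maximizes the log-posterior under the naive independence assumption:
#
# $P(x_i \mid y=c) = \frac{1}{\sqrt{2\pi\sigma_c^2}} \exp\left(-\frac{(x_i-\mu_c)^2}{2\sigma_c^2}\right)$
#
# $\hat{y} = \arg\max_c \left[ \log P(y=c) + \sum_i \log P(x_i \mid y=c) \right]$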
# Imports
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

iris = load_iris()
X = iris.data[:, :2]
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
print("Naive Bayes classification accuracy", accuracy(y_test, predictions))