"""
Q1. Use object-oriented programming to write a Python script for the
Decision Tree Classification algorithm without using any existing standard
class. Create your own class and functions which can perform the following
functionality:
  • Function to calculate the entropy
  • Function to calculate the Information gain
  • Function to find maximum Information gain
"""
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

dataset = pd.read_csv('Iris.csv')
print(dataset.head())
# Commented out IPython magic to ensure Python compatibility.
# %%latex
#
# Entropy = $-\sum_{i=1}^{n} P_i \log_2(P_i)$
def entropy_calculate(prob_list):
    """Shannon entropy (in bits) of a list of probabilities."""
    entropy = 0
    for item in prob_list:
        entropy -= item * np.log2(item)
    return entropy
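# Quick sanity check (added for illustration): a fair coin has two equally
# likely outcomes, so its entropy should be exactly 1 bit.
print('Entropy of a fair coin: %.3f bits' % entropy_calculate([0.5, 0.5]))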
cases, counts = np.unique(dataset.Species, return_counts=True)
P = [count / len(dataset) for count in counts]
# The Iris dataset has three species, so report every class probability.
for case, prob in zip(cases, P):
    print('Probability of %s is %.3f' % (case, prob))

entropy_entire = entropy_calculate(P)
print("Entire system's entropy is %.3f bits" % entropy_entire)
cases_SepalLengthCm, counts_SepalLengthCm = np.unique(dataset.SepalLengthCm, return_counts=True)
P_SepalLengthCm = [count / len(dataset) for count in counts_SepalLengthCm]
print('For SepalLengthCm:')
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalLengthCm = {}
total_entropy_SepalLengthCm = 0
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalLengthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.SepalLengthCm == case]) for count in counts]
    entropy_SepalLengthCm[case] = entropy_calculate(P)
    total_entropy_SepalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalLengthCm decision level is %.3f' % total_entropy_SepalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalLengthCm))
cases_SepalWidthCm, counts_SepalWidthCm = np.unique(dataset.SepalWidthCm, return_counts=True)
P_SepalWidthCm = [count / len(dataset) for count in counts_SepalWidthCm]
print('For SepalWidthCm:')
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalWidthCm = {}
total_entropy_SepalWidthCm = 0
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalWidthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.SepalWidthCm == case]) for count in counts]
    entropy_SepalWidthCm[case] = entropy_calculate(P)
    total_entropy_SepalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalWidthCm decision level is %.3f' % total_entropy_SepalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalWidthCm))
cases_PetalLengthCm, counts_PetalLengthCm = np.unique(dataset.PetalLengthCm, return_counts=True)
P_PetalLengthCm = [count / len(dataset) for count in counts_PetalLengthCm]
print('For PetalLengthCm:')
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalLengthCm = {}
total_entropy_PetalLengthCm = 0
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalLengthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.PetalLengthCm == case]) for count in counts]
    entropy_PetalLengthCm[case] = entropy_calculate(P)
    total_entropy_PetalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalLengthCm decision level is %.3f' % total_entropy_PetalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalLengthCm))
cases_PetalWidthCm, counts_PetalWidthCm = np.unique(dataset.PetalWidthCm, return_counts=True)
P_PetalWidthCm = [count / len(dataset) for count in counts_PetalWidthCm]
print('For PetalWidthCm:')
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalWidthCm = {}
total_entropy_PetalWidthCm = 0
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalWidthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.PetalWidthCm == case]) for count in counts]
    entropy_PetalWidthCm[case] = entropy_calculate(P)
    total_entropy_PetalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalWidthCm decision level is %.3f' % total_entropy_PetalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalWidthCm))
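# The four per-feature blocks above repeat the same computation. A minimal
# sketch of the class and functions Q1 asks for (class and method names
# here are hypothetical, not from the original script), assuming a
# DataFrame with one categorical target column:
class EntropyTree:
    def __init__(self, data, target):
        self.data = data
        self.target = target

    def entropy(self, subset):
        # Shannon entropy (bits) of the target column over `subset`.
        _, counts = np.unique(subset[self.target], return_counts=True)
        probs = counts / len(subset)
        return -np.sum(probs * np.log2(probs))

    def information_gain(self, feature):
        # Entropy of the whole set minus the weighted entropy of the
        # subsets produced by splitting on `feature`.
        values, counts = np.unique(self.data[feature], return_counts=True)
        weighted = 0
        for value, count in zip(values, counts):
            subset = self.data[self.data[feature] == value]
            weighted += (count / len(self.data)) * self.entropy(subset)
        return self.entropy(self.data) - weighted

    def max_information_gain(self, features):
        # Feature with the highest information gain.
        return max(features, key=self.information_gain)

tree = EntropyTree(dataset, 'Species')
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
print('Best split feature:', tree.max_information_gain(feature_cols))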
# Training
training_data = dataset[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']].copy()
print(training_data.head())

# Encode every column as integer category codes, keeping a map from each
# code back to the original value.
category_map = {}
for column in ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
print(training_data.head())

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))
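# category_map recovers original values from codes; for the target column
# it should look like (assuming the standard Kaggle Iris.csv labels)
# {0: 'Iris-setosa', 1: 'Iris-versicolor', 2: 'Iris-virginica'}.
print(category_map['Species'])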
from sklearn.model_selection import train_test_split

X = training_data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = training_data[['Species']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))
from sklearn.tree import DecisionTreeClassifier

# X_train and y_train already hold exactly the feature and target columns,
# so the classifier can be fitted on them directly.
clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='entropy')
clf = clf.fit(X_train, y_train)
print(clf)
print(clf.feature_importances_)
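# Pair each feature name with its importance score for readability
# (column order matches X_train; a small illustrative addition).
for name, importance in zip(X_train.columns, clf.feature_importances_):
    print('%s: %.3f' % (name, importance))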
from sklearn.metrics import accuracy_score

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print(y_pred)
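# The classifier predicts integer category codes; category_map translates
# them back to species names (illustrative use of the map built earlier).
print([category_map['Species'][code] for code in y_pred])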