"""
Q1. Use object-oriented programming to write a Python script for the
Decision Tree Classification algorithm without using any existing standard
class. Create your own class and functions which can perform the following
functionality:
  • Function to calculate the entropy
  • Function to calculate the Information gain
  • Function to find maximum Information gain
"""
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

dataset = pd.read_csv('Iris.csv')
print(dataset.head())
# Commented out IPython magic to ensure Python compatibility.
# %%latex
#
# Entropy = $-\sum_{i=1}^{n} P_i \log_2(P_i)$
def entropy_calculate(prob_list):
    """Shannon entropy (in bits) of a list of probabilities."""
    entropy = 0
    for item in prob_list:
        entropy -= item * np.log2(item)
    return entropy
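# Quick sanity check (added for illustration): a fair coin has two equally
# likely outcomes, so its entropy should be exactly 1 bit.
print('Entropy of a fair coin: %.3f bits' % entropy_calculate([0.5, 0.5]))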
cases, counts = np.unique(dataset.Species, return_counts=True)
P = [count / len(dataset) for count in counts]
# The Iris dataset has three species, so report every class probability.
for case, prob in zip(cases, P):
    print('Probability of %s is %.3f' % (case, prob))

entropy_entire = entropy_calculate(P)
print("Entire system's entropy is %.3f bits" % entropy_entire)
cases_SepalLengthCm, counts_SepalLengthCm = np.unique(dataset.SepalLengthCm, return_counts=True)
P_SepalLengthCm = [count / len(dataset) for count in counts_SepalLengthCm]
print('For SepalLengthCm:')
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalLengthCm = {}
total_entropy_SepalLengthCm = 0
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalLengthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.SepalLengthCm == case]) for count in counts]
    entropy_SepalLengthCm[case] = entropy_calculate(P)
    total_entropy_SepalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalLengthCm decision level is %.3f' % total_entropy_SepalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalLengthCm))
cases_SepalWidthCm, counts_SepalWidthCm = np.unique(dataset.SepalWidthCm, return_counts=True)
P_SepalWidthCm = [count / len(dataset) for count in counts_SepalWidthCm]
print('For SepalWidthCm:')
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalWidthCm = {}
total_entropy_SepalWidthCm = 0
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalWidthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.SepalWidthCm == case]) for count in counts]
    entropy_SepalWidthCm[case] = entropy_calculate(P)
    total_entropy_SepalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalWidthCm decision level is %.3f' % total_entropy_SepalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalWidthCm))
cases_PetalLengthCm, counts_PetalLengthCm = np.unique(dataset.PetalLengthCm, return_counts=True)
P_PetalLengthCm = [count / len(dataset) for count in counts_PetalLengthCm]
print('For PetalLengthCm:')
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalLengthCm = {}
total_entropy_PetalLengthCm = 0
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalLengthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.PetalLengthCm == case]) for count in counts]
    entropy_PetalLengthCm[case] = entropy_calculate(P)
    total_entropy_PetalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalLengthCm decision level is %.3f' % total_entropy_PetalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalLengthCm))
cases_PetalWidthCm, counts_PetalWidthCm = np.unique(dataset.PetalWidthCm, return_counts=True)
P_PetalWidthCm = [count / len(dataset) for count in counts_PetalWidthCm]
print('For PetalWidthCm:')
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalWidthCm = {}
total_entropy_PetalWidthCm = 0
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalWidthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.PetalWidthCm == case]) for count in counts]
    entropy_PetalWidthCm[case] = entropy_calculate(P)
    total_entropy_PetalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalWidthCm decision level is %.3f' % total_entropy_PetalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalWidthCm))
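# The four per-feature blocks above repeat the same computation. A minimal
# sketch of the class and functions Q1 asks for (class and method names
# here are hypothetical, not from the original script), assuming a
# DataFrame with one categorical target column:
class EntropyTree:
    def __init__(self, data, target):
        self.data = data
        self.target = target

    def entropy(self, subset):
        # Shannon entropy (bits) of the target column over `subset`.
        _, counts = np.unique(subset[self.target], return_counts=True)
        probs = counts / len(subset)
        return -np.sum(probs * np.log2(probs))

    def information_gain(self, feature):
        # Entropy of the whole set minus the weighted entropy of the
        # subsets produced by splitting on `feature`.
        values, counts = np.unique(self.data[feature], return_counts=True)
        weighted = 0
        for value, count in zip(values, counts):
            subset = self.data[self.data[feature] == value]
            weighted += (count / len(self.data)) * self.entropy(subset)
        return self.entropy(self.data) - weighted

    def max_information_gain(self, features):
        # Feature with the highest information gain.
        return max(features, key=self.information_gain)

tree = EntropyTree(dataset, 'Species')
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
print('Best split feature:', tree.max_information_gain(feature_cols))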
# Training
training_data = dataset[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']].copy()
print(training_data.head())

# Encode every column as integer category codes, keeping a map from each
# code back to the original value.
category_map = {}
for column in ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
print(training_data.head())

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))
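# category_map recovers original values from codes; for the target column
# it should look like (assuming the standard Kaggle Iris.csv labels)
# {0: 'Iris-setosa', 1: 'Iris-versicolor', 2: 'Iris-virginica'}.
print(category_map['Species'])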
from sklearn.model_selection import train_test_split

X = training_data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = training_data[['Species']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))
from sklearn.tree import DecisionTreeClassifier

# X_train and y_train already hold exactly the feature and target columns,
# so the classifier can be fitted on them directly.
clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='entropy')
clf = clf.fit(X_train, y_train)
print(clf)
print(clf.feature_importances_)
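# Pair each feature name with its importance score for readability
# (column order matches X_train; a small illustrative addition).
for name, importance in zip(X_train.columns, clf.feature_importances_):
    print('%s: %.3f' % (name, importance))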
from sklearn.metrics import accuracy_score

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print(y_pred)
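# The classifier predicts integer category codes; category_map translates
# them back to species names (illustrative use of the map built earlier).
print([category_map['Species'][code] for code in y_pred])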