"""
Q1. Use object-oriented programming to write a Python script for the Decision
Tree Classification algorithm without using any existing standard class.
Create your own class and functions that can:
    • calculate the entropy
    • calculate the information gain
    • find the maximum information gain
"""

import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

dataset = pd.read_csv('Iris.csv')
print(dataset.head())

# Entropy of a distribution with class probabilities P_1, ..., P_n:
# Entropy = $-\sum_{i=1}^{n} P_i \times \log_b(P_i)$  (b = 2 gives bits)

def entropy_calculate(prob_list):
    """Return the Shannon entropy (in bits) of a list of probabilities."""
    entropy = 0
    for item in prob_list:
        if item > 0:  # skip zero probabilities; log2(0) is undefined
            entropy -= item * np.log2(item)
    return entropy

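# Quick sanity checks for entropy_calculate (illustrative additions, not in
# the original paste): a fair two-way split is exactly 1 bit, and a uniform
# three-way split is log2(3), about 1.585 bits.
print(entropy_calculate([0.5, 0.5]))       # expected: 1.0
print(entropy_calculate([1/3, 1/3, 1/3]))  # expected: ~1.585
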
cases, counts = np.unique(dataset.Species, return_counts=True)
P = [count / len(dataset) for count in counts]
# Iris has three species, so report every class probability rather than two.
for case, prob in zip(cases, P):
    print('Probability of %s is %.3f' % (case, prob))

entropy_entire = entropy_calculate(P)

print("Entire system's entropy is %.3f bits" % entropy_entire)

# For each feature: weighted average entropy after splitting on the feature,
# then the information gain relative to the entire system's entropy.
cases_SepalLengthCm, counts_SepalLengthCm = np.unique(dataset.SepalLengthCm, return_counts=True)
P_SepalLengthCm = [count / len(dataset) for count in counts_SepalLengthCm]
print('For SepalLengthCm:')
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalLengthCm = {}
total_entropy_SepalLengthCm = 0
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalLengthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.SepalLengthCm == case]) for count in counts]
    entropy_SepalLengthCm[case] = entropy_calculate(P)
    total_entropy_SepalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalLengthCm decision level is %.3f' % total_entropy_SepalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalLengthCm))

cases_SepalWidthCm, counts_SepalWidthCm = np.unique(dataset.SepalWidthCm, return_counts=True)
P_SepalWidthCm = [count / len(dataset) for count in counts_SepalWidthCm]
print('For SepalWidthCm:')
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalWidthCm = {}
total_entropy_SepalWidthCm = 0
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalWidthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.SepalWidthCm == case]) for count in counts]
    entropy_SepalWidthCm[case] = entropy_calculate(P)
    total_entropy_SepalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_SepalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalWidthCm decision level is %.3f' % total_entropy_SepalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalWidthCm))

cases_PetalLengthCm, counts_PetalLengthCm = np.unique(dataset.PetalLengthCm, return_counts=True)
P_PetalLengthCm = [count / len(dataset) for count in counts_PetalLengthCm]
print('For PetalLengthCm:')
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalLengthCm = {}
total_entropy_PetalLengthCm = 0
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalLengthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.PetalLengthCm == case]) for count in counts]
    entropy_PetalLengthCm[case] = entropy_calculate(P)
    total_entropy_PetalLengthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalLengthCm decision level is %.3f' % total_entropy_PetalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalLengthCm))

cases_PetalWidthCm, counts_PetalWidthCm = np.unique(dataset.PetalWidthCm, return_counts=True)
P_PetalWidthCm = [count / len(dataset) for count in counts_PetalWidthCm]
print('For PetalWidthCm:')
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalWidthCm = {}
total_entropy_PetalWidthCm = 0
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalWidthCm == case], return_counts=True)
    P = [count / len(dataset[dataset.PetalWidthCm == case]) for count in counts]
    entropy_PetalWidthCm[case] = entropy_calculate(P)
    total_entropy_PetalWidthCm += entropy_calculate(P) * prob

for case, entropy in entropy_PetalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalWidthCm decision level is %.3f' % total_entropy_PetalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalWidthCm))

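# The four blocks above repeat the same computation once per feature. Since
# the assignment asks for reusable functions (information gain and maximum
# information gain), here is a minimal consolidation sketch (my own refactor,
# not part of the original paste).

def information_gain(df, feature, target='Species'):
    """Information gain from splitting df on `feature` with respect to `target`."""
    _, counts = np.unique(df[target], return_counts=True)
    base_entropy = entropy_calculate([c / len(df) for c in counts])
    split_entropy = 0
    values, value_counts = np.unique(df[feature], return_counts=True)
    for value, value_count in zip(values, value_counts):
        subset = df[df[feature] == value]
        _, sub_counts = np.unique(subset[target], return_counts=True)
        split_entropy += (value_count / len(df)) * entropy_calculate([c / len(subset) for c in sub_counts])
    return base_entropy - split_entropy

def max_information_gain(df, features, target='Species'):
    """Return (best_feature, gain) over the candidate features."""
    gains = {feature: information_gain(df, feature, target) for feature in features}
    best = max(gains, key=gains.get)
    return best, gains[best]

best_feature, best_gain = max_information_gain(
    dataset, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'])
print('Feature with maximum information gain: %s (%.3f)' % (best_feature, best_gain))
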
# Training
# .copy() avoids pandas' SettingWithCopyWarning when encoding columns below.
training_data = dataset[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']].copy()
print(training_data.head())

# Encode each column as integer category codes, keeping a map from code back
# to the original value.
category_map = {}
for column in ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
print(training_data.head())

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))

from sklearn.model_selection import train_test_split

X = training_data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = training_data[['Species']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))

from sklearn.tree import DecisionTreeClassifier

X = X_train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = y_train[['Species']]

clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='entropy')
clf = clf.fit(X, y)
print(clf)

print(clf.feature_importances_)

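# Pair each importance score with its feature name for readability (a small
# addition, not in the original paste).
for name, importance in zip(X.columns, clf.feature_importances_):
    print('%s: %.3f' % (name, importance))
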
from sklearn.metrics import accuracy_score

# The original computed clf.predict(X_test) twice (as `predictions` and
# `y_pred`); one call suffices.
y_pred = clf.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print(y_pred)

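# The classifier predicts integer category codes; category_map, built during
# preprocessing but never read in the original paste, can translate them back
# to species names. Illustrative use:
print([category_map['Species'][code] for code in y_pred])
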