Advertisement
makispaiktis

ML - Lab 2 - Tree Classification

Oct 18th, 2022 (edited)
1,242
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.18 KB | None | 0 0
  1. import math
  2. import numpy as np
  3. import pandas as pd
  4. import matplotlib.pyplot as plt
  5. from sklearn import tree
  6. from sklearn.preprocessing import OneHotEncoder
  7.  
  8. print("********************************")
  9. print("Entropy - Information Gain")
  10. print("********************************")
  11. weather = pd.read_csv("./weather.txt")
  12. names = ["Outlook", "Temperature", "Humidity"]
  13. print("DataFrame: ")
  14. print(weather)
  15. print()
  16.  
  17.  
  18.  
  19. # ************************************************************************
  20. # ************************************************************************
  21. # GINI Index
  22. print("********************************")
  23. print("GINI Index")
  24. print("********************************")
  25. # ************************************************************************
  26. # ************************************************************************
  27. # GINI Index - Outlook
  28. absfreq = pd.crosstab(weather.Outlook, weather.Play)
  29. freq = pd.crosstab(weather.Outlook, weather.Play, normalize='index') # Contains the indices
  30. freqSum = pd.crosstab(weather.Outlook, weather.Play, normalize='all').sum(axis=1)
  31. print("absfreq = ")
  32. print(absfreq)
  33. print()
  34. print("freq = ")
  35. print(freq)
  36. print()
  37. print("freqSum = ")
  38. print(freqSum)
  39. print()
  40.  
  41. GINI_Sunny = 1 - freq.loc["Sunny", "No"]**2 - freq.loc["Sunny", "Yes"]**2
  42. GINI_Rainy = 1 - freq.loc["Rainy", "No"]**2 - freq.loc["Rainy", "Yes"]**2
  43. GINI_Outlook = freqSum.loc["Sunny"] * GINI_Sunny + freqSum["Rainy"] * GINI_Rainy
  44. print("GINI_Sunny = " + str(GINI_Sunny))
  45. print("GINI_Rainy = " + str(GINI_Rainy))
  46. print("GINI_Outlook = " + str(GINI_Outlook))
  47. print()
  48.  
  49. # GINI Index - Temperature
  50. abstemp = pd.crosstab(weather.Temperature, weather.Play)
  51. temp = pd.crosstab(weather.Temperature, weather.Play, normalize='index') # Contains the indices
  52. tempSum = pd.crosstab(weather.Temperature, weather.Play, normalize='all').sum(axis=1)
  53. print("abstemp = ")
  54. print(abstemp)
  55. print()
  56. print("temp = ")
  57. print(temp)
  58. print()
  59. print("tempSum = ")
  60. print(tempSum)
  61. print()
  62.  
  63. GINI_Hot = 1 - temp.loc["Hot", "No"]**2 - temp.loc["Hot", "Yes"]**2
  64. GINI_Cool = 1 - temp.loc["Cool", "No"]**2 - temp.loc["Cool", "Yes"]**2
  65. GINI_Temperature = tempSum.loc["Hot"] * GINI_Hot + tempSum["Cool"] * GINI_Cool
  66. print("GINI_Hot = " + str(GINI_Hot))
  67. print("GINI_Cool = " + str(GINI_Cool))
  68. print("GINI_Temperature = " + str(GINI_Temperature))
  69. print()
  70.  
  71. # GINI Index - Humidity
  72. abshum = pd.crosstab(weather.Humidity, weather.Play)
  73. hum = pd.crosstab(weather.Humidity, weather.Play, normalize='index') # Contains the indices
  74. humSum = pd.crosstab(weather.Humidity, weather.Play, normalize='all').sum(axis=1)
  75. print("abshum = ")
  76. print(abshum)
  77. print()
  78. print("hum = ")
  79. print(hum)
  80. print()
  81. print("humSum = ")
  82. print(humSum)
  83. print()
  84.  
  85. GINI_High = 1 - hum.loc["High", "No"]**2 - hum.loc["High", "Yes"]**2
  86. GINI_Low = 1 - hum.loc["Low", "No"]**2 - hum.loc["Low", "Yes"]**2
  87. GINI_Humidity = humSum.loc["High"] * GINI_High + humSum["Low"] * GINI_Low
  88. print("GINI_High = " + str(GINI_High))
  89. print("GINI_Low = " + str(GINI_Low))
  90. print("GINI_Humidity = " + str(GINI_Humidity))
  91. print()
  92.  
  93.  
  94. GINIs = [GINI_Outlook, GINI_Temperature, GINI_Humidity]
  95. GINIs_df = pd.DataFrame(GINIs, names)
  96. print(GINIs_df)
  97. print()
  98. MIN = min(GINIs)
  99. MIN_INDEX = GINIs.index(MIN)
  100. print("The most appropriate feature for classification with GINI is '" + names[MIN_INDEX] + "' with GINI = " + str(MIN))
  101. print()
  102. print()
  103. print()
  104.  
  105.  
  106.  
  107.  
  108. # ************************************************************************
  109. # ************************************************************************
  110. # Entropy - Information Gain
  111. # ************************************************************************
  112. # ************************************************************************
  113. print("********************************")
  114. print("Entropy - Information Gain")
  115. print("********************************")
  116. # First, I have to find the total entropy
  117. freq_tot = pd.crosstab("Play", weather.Play, normalize="index")
  118. print(freq_tot)
  119. Entropy_All = - freq_tot.No * math.log2(freq_tot.No) - freq_tot.Yes * math.log2(freq_tot.Yes)
  120. Entropy_All = Entropy_All['Play']
  121. print("Entropy_All = " + str(Entropy_All))
  122. print()
  123.  
  124. # Entropy - Outlook
  125. Entropy_Sunny = - freq.loc['Sunny', 'No'] * math.log2(freq.loc['Sunny', 'No']) - freq.loc['Sunny', 'Yes'] * math.log2(freq.loc['Sunny', 'Yes'])
  126. Entropy_Rainy = - freq.loc['Rainy', 'No'] * math.log2(freq.loc['Rainy', 'No']) - freq.loc['Rainy', 'Yes'] * math.log2(freq.loc['Rainy', 'Yes'])
  127. GAIN_Outlook = Entropy_All - freqSum.loc['Sunny'] * Entropy_Sunny - freqSum.loc['Rainy'] * Entropy_Rainy
  128. print("Entropy_Sunny = " + str(Entropy_Sunny))
  129. print("Entropy_Rainy = " + str(Entropy_Rainy))
  130. print("GAIN_Outlook = " + str(GAIN_Outlook))
  131. print()
  132.  
  133. # Entropy - Temperature
  134. Entropy_Hot = - temp.loc['Hot', 'No'] * math.log2(temp.loc['Hot', 'No']) - temp.loc['Hot', 'Yes'] * math.log2(temp.loc['Hot', 'Yes'])
  135. Entropy_Cool = - temp.loc['Cool', 'No'] * math.log2(temp.loc['Cool', 'No']) - temp.loc['Cool', 'Yes'] * math.log2(temp.loc['Cool', 'Yes'])
  136. GAIN_Temperature = Entropy_All - tempSum.loc['Hot'] * Entropy_Hot - tempSum.loc['Cool'] * Entropy_Cool
  137. print("Entropy_Hot = " + str(Entropy_Hot))
  138. print("Entropy_Cool = " + str(Entropy_Cool))
  139. print("GAIN_Temperature = " + str(GAIN_Temperature))
  140. print()
  141.  
  142. # Entropy - Humidity
  143. Entropy_High = - hum.loc['High', 'No'] * math.log2(hum.loc['High', 'No']) - hum.loc['High', 'Yes'] * math.log2(hum.loc['High', 'Yes'])
  144. Entropy_Low = - hum.loc['Low', 'No'] * math.log2(hum.loc['Low', 'No']) - hum.loc['Low', 'Yes'] * math.log2(hum.loc['Low', 'Yes'])
  145. GAIN_Humidity = Entropy_All - humSum.loc['High'] * Entropy_High - humSum.loc['Low'] * Entropy_Low
  146. print("Entropy_High = " + str(Entropy_High))
  147. print("Entropy_Low = " + str(Entropy_Low))
  148. print("GAIN_Humidity = " + str(GAIN_Humidity))
  149. print()
  150.  
  151.  
  152. GAINs = [GAIN_Outlook, GAIN_Temperature, GAIN_Humidity]
  153. GAINs_df = pd.DataFrame(GAINs, names)
  154. print(GAINs_df)
  155. print()
  156. MIN = min(GAINs)
  157. MIN_INDEX = GAINs.index(MIN)
  158. print("The most appropriate feature for classification with GAIN is '" + names[MIN_INDEX] + "' with GINI = " + str(MIN))
  159. print()
  160. print()
  161. print()
  162.  
  163.  
  164.  
  165.  
  166.  
  167. # ************************************************************************
  168. # ************************************************************************
  169. # Tree Creation
  170. # ************************************************************************
  171. # ************************************************************************
  172.  
  173. # Encoder
  174. encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
  175. encoder.fit(weather.loc[:, ['Outlook', 'Temperature', 'Humidity']])
  176. transformed = encoder.transform(weather.loc[:, ['Outlook', 'Temperature', 'Humidity']])
  177. # Classification with trees
  178. clf = tree.DecisionTreeClassifier()
  179. clf = clf.fit(transformed, weather.loc[:, 'Play'])
  180. # Plots
  181. fig = plt.figure(figsize=(10, 9))
  182. tree.plot_tree(clf, class_names=['No', 'Yes'], filled=True)
  183. plt.show()
  184. # Text representation
  185. text_representation = tree.export_text(clf)
  186. print(text_representation)
  187. # Prediction of new data
  188. new_data = pd.DataFrame({"Outlook": ["Sunny"], "Temperature": ["Cold"], "Humidity": ["High"]})
  189. transformed_new_data = encoder.transform(new_data)
  190. print(clf.predict(transformed_new_data))
  191. print(clf.predict_proba(transformed_new_data))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement