# Q1 DECISION TREE -- SHRUTIKA PANDEY 102095004

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
# %matplotlib inline

"""# Decision tree Dataset"""

data_dict = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy', 'Rainy', 'Rainy'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild', 'Cool', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['False', 'True', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'True', 'True', 'False'],
    'PlayGolf': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes']
}
golf_data = pd.DataFrame(data_dict)
golf_data

"""# Entropy"""

# Commented out IPython magic to ensure Python compatibility.
# %%latex
#
# Entropy = $-\sum_{i=1}^{n} P_i \times \log_2(P_i)$

def entropy_calculate(prob_list):
    # Shannon entropy (in bits, since the log is base 2) of a probability distribution
    entropy = 0
    for item in prob_list:
        entropy -= item * np.log2(item)
    return entropy

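# Quick sanity check of entropy_calculate: a fair coin should give exactly
# 1 bit, and the dataset's 9 Yes / 5 No split should give roughly 0.940 bits.
print(entropy_calculate([0.5, 0.5]))    # 1.0
print(entropy_calculate([9/14, 5/14]))  # ~0.940
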
cases, counts = np.unique(golf_data.PlayGolf, return_counts=True)
P = [count/len(golf_data) for count in counts]
print('Probabilities of %s and %s are %.3f, %.3f respectively' % (cases[0], cases[1], P[0], P[1]))

entropy_entire = entropy_calculate(P)

print("Entire system's entropy is %.3f bits" % entropy_entire)

"""# Information Gain

# Outlook decision
"""

cases_outlook, counts_outlook = np.unique(golf_data.Outlook, return_counts=True)
P_outlook = [count/len(golf_data) for count in counts_outlook]
print('For outlook:')
for case, prob in zip(cases_outlook, P_outlook):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_outlook = {}
total_entropy_outlook = 0
for case, prob in zip(cases_outlook, P_outlook):
    cases, counts = np.unique(golf_data.PlayGolf[golf_data.Outlook == case], return_counts=True)
    P = [count/len(golf_data[golf_data.Outlook == case]) for count in counts]
    entropy_outlook[case] = entropy_calculate(P)
    total_entropy_outlook += entropy_outlook[case] * prob

for case, entropy in entropy_outlook.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at Outlook decision level is %.3f' % total_entropy_outlook)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_outlook))

"""# Temperature Decision"""

cases_temperature, counts_temperature = np.unique(golf_data.Temperature, return_counts=True)
P_temperature = [count/len(golf_data) for count in counts_temperature]
print('For temperature:')
for case, prob in zip(cases_temperature, P_temperature):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_temperature = {}
total_entropy_temperature = 0
for case, prob in zip(cases_temperature, P_temperature):
    cases, counts = np.unique(golf_data.PlayGolf[golf_data.Temperature == case], return_counts=True)
    P = [count/len(golf_data[golf_data.Temperature == case]) for count in counts]
    entropy_temperature[case] = entropy_calculate(P)
    total_entropy_temperature += entropy_temperature[case] * prob

for case, entropy in entropy_temperature.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at Temperature decision level is %.3f' % total_entropy_temperature)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_temperature))

"""# Wind Decision"""

cases_wind, counts_wind = np.unique(golf_data.Wind, return_counts=True)
P_wind = [count/len(golf_data) for count in counts_wind]
print('For wind:')
for case, prob in zip(cases_wind, P_wind):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_wind = {}
total_entropy_wind = 0
for case, prob in zip(cases_wind, P_wind):
    cases, counts = np.unique(golf_data.PlayGolf[golf_data.Wind == case], return_counts=True)
    P = [count/len(golf_data[golf_data.Wind == case]) for count in counts]
    entropy_wind[case] = entropy_calculate(P)
    total_entropy_wind += entropy_wind[case] * prob

for case, entropy in entropy_wind.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at Wind decision level is %.3f' % total_entropy_wind)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_wind))

"""# Humidity Decision"""

cases_humidity, counts_humidity = np.unique(golf_data.Humidity, return_counts=True)
P_humidity = [count/len(golf_data) for count in counts_humidity]
print('For humidity:')
for case, prob in zip(cases_humidity, P_humidity):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_humidity = {}
total_entropy_humidity = 0
for case, prob in zip(cases_humidity, P_humidity):
    cases, counts = np.unique(golf_data.PlayGolf[golf_data.Humidity == case], return_counts=True)
    P = [count/len(golf_data[golf_data.Humidity == case]) for count in counts]
    entropy_humidity[case] = entropy_calculate(P)
    total_entropy_humidity += entropy_humidity[case] * prob

for case, entropy in entropy_humidity.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at Humidity decision level is %.3f' % total_entropy_humidity)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_humidity))

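# With all four attributes scored, ID3 would split on the highest information
# gain; gathering the gains computed above makes the comparison explicit
# (Outlook should come out on top for this dataset).
gains = {
    'Outlook': entropy_entire - total_entropy_outlook,
    'Temperature': entropy_entire - total_entropy_temperature,
    'Wind': entropy_entire - total_entropy_wind,
    'Humidity': entropy_entire - total_entropy_humidity,
}
best_attribute = max(gains, key=gains.get)
print('Best attribute to split on: %s (gain %.3f)' % (best_attribute, gains[best_attribute]))
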
# Training
# Take an explicit copy so the categorical encoding below does not write
# through to golf_data.
training_data = golf_data[['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayGolf']].copy()
training_data.head()

category_map = {}
for column in ['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayGolf']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
training_data.head()

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))

"""# Split to train and test data"""

from sklearn.model_selection import train_test_split

X = training_data[['Outlook', 'Temperature', 'Humidity', 'Wind']]
y = training_data[['PlayGolf']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))

"""# Build a decision tree"""

from sklearn.tree import DecisionTreeClassifier

X = X_train[['Outlook', 'Temperature', 'Humidity', 'Wind']]
y = y_train[['PlayGolf']]

clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='entropy')

clf = clf.fit(X, y)

clf

clf.feature_importances_

"""# Tree visualization"""

from sklearn import tree

# X[0] -> Outlook, X[1] -> Temperature, X[2] -> Humidity, X[3] -> Wind
fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(clf, fontsize=10)
plt.show()

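# The plot can be hard to read at this size; as an alternative view, sklearn's
# tree.export_text prints the same splits as indented text, reusing the feature
# names from the comment above.
print(tree.export_text(clf, feature_names=['Outlook', 'Temperature', 'Humidity', 'Wind']))
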
"""# Evaluate the tree"""

from sklearn.metrics import accuracy_score

predictions = clf.predict(X_test)

accuracy_score(y_test, predictions)

print(predictions)

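# Accuracy alone says little on a test set this small; a confusion matrix,
# sketched here with sklearn.metrics.confusion_matrix, shows where any
# misclassifications fall (rows: actual, columns: predicted).
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predictions))
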
# Predicting for today and tomorrow
today_tomorrow = {
    'Outlook': ['Sunny', 'Sunny'],
    'Temperature': ['Cool', 'Mild'],
    'Humidity': ['Normal', 'Normal'],
    'Wind': ['False', 'False']
}
todtom_data = pd.DataFrame(today_tomorrow)
todtom_data

# Encode the new samples with the same category codes used in training;
# deriving fresh codes from just these two rows would not match the
# encoding the classifier was trained on.
for column in ['Outlook', 'Temperature', 'Humidity', 'Wind']:
    code_of = {v: k for k, v in category_map[column].items()}
    todtom_data[column] = todtom_data[column].map(code_of)
todtom_data.head()

todtom_data.dropna(inplace=True)
todtom_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(todtom_data)))

p = clf.predict(todtom_data)

pred = ['No', 'No']
for i in range(2):
    if p[i] == 1:
        pred[i] = 'Yes'

print(f'Prediction for today is {pred[0]}')
print(f'Prediction for tomorrow is {pred[1]}')

"""# A) GINI IMPURITY -- SHRUTIKA PANDEY 102095004"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data_dict = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy', 'Rainy', 'Rainy'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild', 'Cool', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['False', 'True', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'True', 'True', 'False'],
    'PlayGolf': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes']
}
golf_data = pd.DataFrame(data_dict)
golf_data

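# The classifier below uses criterion='gini' instead of entropy. For reference,
# a minimal sketch of the impurity itself, Gini = 1 - sum(p_i^2), applied to
# the PlayGolf column (the 9 Yes / 5 No split gives roughly 0.459).
def gini_calculate(prob_list):
    return 1 - sum(p**2 for p in prob_list)

_, counts = np.unique(golf_data.PlayGolf, return_counts=True)
print('Gini impurity of the entire system is %.3f' % gini_calculate([c/len(golf_data) for c in counts]))
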
# Training
# Again copy first, so encoding does not write through to golf_data.
training_data = golf_data[['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayGolf']].copy()
training_data.head()

category_map = {}
for column in ['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayGolf']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
training_data.head()

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))

from sklearn.model_selection import train_test_split

X = training_data[['Outlook', 'Temperature', 'Humidity', 'Wind']]
y = training_data[['PlayGolf']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))

from sklearn.tree import DecisionTreeClassifier

X = X_train[['Outlook', 'Temperature', 'Humidity', 'Wind']]
y = y_train[['PlayGolf']]

clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='gini')

clf = clf.fit(X, y)

clf.feature_importances_

from sklearn import tree

# X[0] -> Outlook, X[1] -> Temperature, X[2] -> Humidity, X[3] -> Wind
fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(clf, fontsize=10)
plt.show()

from sklearn.metrics import accuracy_score

predictions = clf.predict(X_test)

accuracy_score(y_test, predictions)

print(predictions)

# Predicting for today and tomorrow
today_tomorrow = {
    'Outlook': ['Sunny', 'Sunny'],
    'Temperature': ['Cool', 'Mild'],
    'Humidity': ['Normal', 'Normal'],
    'Wind': ['False', 'False']
}
todtom_data = pd.DataFrame(today_tomorrow)
todtom_data

# As before, encode the new samples with the category codes from training
# rather than re-deriving codes from these two rows alone.
for column in ['Outlook', 'Temperature', 'Humidity', 'Wind']:
    code_of = {v: k for k, v in category_map[column].items()}
    todtom_data[column] = todtom_data[column].map(code_of)
todtom_data.head()

todtom_data.dropna(inplace=True)
todtom_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(todtom_data)))

p = clf.predict(todtom_data)

pred = ['No', 'No']
for i in range(2):
    if p[i] == 1:
        pred[i] = 'Yes'

print(f'Prediction for today is {pred[0]}')
print(f'Prediction for tomorrow is {pred[1]}')

"""# Q2 MULTIPLE LINEAR REGRESSION - 102095004

# B) MULTIPLE REGRESSION
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('BostonHousing.csv')
data.head()

# data = pd.read_csv('BostonHousing.csv')
# X = pd.DataFrame(data.iloc[:, :-1])
# y = pd.DataFrame(data.iloc[:, -1])
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
print(X)

print(y)

sns.heatmap(data.corr())

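# The heatmap gives the visual overview; numerically, the correlation of each
# feature with the target (taken here as the last column of the CSV) can be
# listed directly.
target = data.columns[-1]
print(data.corr()[target].drop(target).sort_values())
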
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print(y_pred)

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

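# On training data R^2 never decreases as predictors are added; adjusted R^2
# corrects for model size using n samples and p features:
# R^2_adj = 1 - (1 - R^2)(n - 1)/(n - p - 1). A quick sketch on the test split:
n, p_feats = X_test.shape
r2 = r2_score(y_test, y_pred)
print('Adjusted R^2: %.3f' % (1 - (1 - r2) * (n - 1) / (n - p_feats - 1)))
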
plt.scatter(y_test, y_pred)
plt.xlabel('actual')
plt.ylabel('predicted')
plt.title('actual vs predicted')
plt.show()

pred_y_dataset = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Difference': y_test - y_pred})
pred_y_dataset[0:10]

y_pred = lr.predict(X_test)
y_pred = pd.DataFrame(y_pred, columns=['Predicted value'])
y_pred

y_test

# coeff_df = pd.concat([w, v], axis=1, join='inner')
# coeff_df

from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

"""# C) DECISION TREE FOR IRIS DATASET -- SHRUTIKA PANDEY 102095004"""

import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

dataset = pd.read_csv('Iris.csv')
dataset

# Commented out IPython magic to ensure Python compatibility.
# %%latex
#
# Entropy = $-\sum_{i=1}^{n} P_i \times \log_2(P_i)$

def entropy_calculate(prob_list):
    # Shannon entropy (in bits) of a probability distribution
    entropy = 0
    for item in prob_list:
        entropy -= item * np.log2(item)
    return entropy

cases, counts = np.unique(dataset.Species, return_counts=True)
P = [count/len(dataset) for count in counts]
for case, prob in zip(cases, P):  # Iris has three species, so print all of them
    print('Probability of %s is %.3f' % (case, prob))

entropy_entire = entropy_calculate(P)

print("Entire system's entropy is %.3f bits" % entropy_entire)

# Note: the four measurements are continuous; this treats every distinct
# value as its own category when computing the per-value entropies below.
cases_SepalLengthCm, counts_SepalLengthCm = np.unique(dataset.SepalLengthCm, return_counts=True)
P_SepalLengthCm = [count/len(dataset) for count in counts_SepalLengthCm]
print('For SepalLengthCm:')
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalLengthCm = {}
total_entropy_SepalLengthCm = 0
for case, prob in zip(cases_SepalLengthCm, P_SepalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalLengthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.SepalLengthCm == case]) for count in counts]
    entropy_SepalLengthCm[case] = entropy_calculate(P)
    total_entropy_SepalLengthCm += entropy_SepalLengthCm[case] * prob

for case, entropy in entropy_SepalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalLengthCm decision level is %.3f' % total_entropy_SepalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalLengthCm))

cases_SepalWidthCm, counts_SepalWidthCm = np.unique(dataset.SepalWidthCm, return_counts=True)
P_SepalWidthCm = [count/len(dataset) for count in counts_SepalWidthCm]
print('For SepalWidthCm:')
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_SepalWidthCm = {}
total_entropy_SepalWidthCm = 0
for case, prob in zip(cases_SepalWidthCm, P_SepalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.SepalWidthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.SepalWidthCm == case]) for count in counts]
    entropy_SepalWidthCm[case] = entropy_calculate(P)
    total_entropy_SepalWidthCm += entropy_SepalWidthCm[case] * prob

for case, entropy in entropy_SepalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at SepalWidthCm decision level is %.3f' % total_entropy_SepalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_SepalWidthCm))

cases_PetalLengthCm, counts_PetalLengthCm = np.unique(dataset.PetalLengthCm, return_counts=True)
P_PetalLengthCm = [count/len(dataset) for count in counts_PetalLengthCm]
print('For PetalLengthCm:')
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalLengthCm = {}
total_entropy_PetalLengthCm = 0
for case, prob in zip(cases_PetalLengthCm, P_PetalLengthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalLengthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.PetalLengthCm == case]) for count in counts]
    entropy_PetalLengthCm[case] = entropy_calculate(P)
    total_entropy_PetalLengthCm += entropy_PetalLengthCm[case] * prob

for case, entropy in entropy_PetalLengthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalLengthCm decision level is %.3f' % total_entropy_PetalLengthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalLengthCm))

cases_PetalWidthCm, counts_PetalWidthCm = np.unique(dataset.PetalWidthCm, return_counts=True)
P_PetalWidthCm = [count/len(dataset) for count in counts_PetalWidthCm]
print('For PetalWidthCm:')
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    print('\tProbability of %s is %.3f' % (case, prob))

entropy_PetalWidthCm = {}
total_entropy_PetalWidthCm = 0
for case, prob in zip(cases_PetalWidthCm, P_PetalWidthCm):
    cases, counts = np.unique(dataset.Species[dataset.PetalWidthCm == case], return_counts=True)
    P = [count/len(dataset[dataset.PetalWidthCm == case]) for count in counts]
    entropy_PetalWidthCm[case] = entropy_calculate(P)
    total_entropy_PetalWidthCm += entropy_PetalWidthCm[case] * prob

for case, entropy in entropy_PetalWidthCm.items():
    print('Entropy for %s is %.2f' % (case, entropy))
print('\nEntropy at PetalWidthCm decision level is %.3f' % total_entropy_PetalWidthCm)
print('\nInformation gain is %.3f' % (entropy_entire - total_entropy_PetalWidthCm))

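# Same comparison as in Q1: collect the four gains and see which attribute an
# entropy-based split would favour (the petal measurements typically dominate).
gains = {
    'SepalLengthCm': entropy_entire - total_entropy_SepalLengthCm,
    'SepalWidthCm': entropy_entire - total_entropy_SepalWidthCm,
    'PetalLengthCm': entropy_entire - total_entropy_PetalLengthCm,
    'PetalWidthCm': entropy_entire - total_entropy_PetalWidthCm,
}
best_attribute = max(gains, key=gains.get)
print('Best attribute to split on: %s (gain %.3f)' % (best_attribute, gains[best_attribute]))
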
# Training
# Copy first, so the categorical encoding does not write through to dataset.
training_data = dataset[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']].copy()
training_data.head()

category_map = {}
for column in ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']:
    category_map[column] = dict(enumerate(training_data[column].astype('category').cat.categories))
    training_data[column] = training_data[column].astype('category').cat.codes
training_data.head()

training_data.dropna(inplace=True)
training_data.reset_index(drop=True, inplace=True)
print('Total number of valid records: {}'.format(len(training_data)))

from sklearn.model_selection import train_test_split

X = training_data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = training_data[['Species']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print('Total number of records used for training: {}\nTotal number of records used for testing: {}'.format(len(X_train), len(X_test)))

from sklearn.tree import DecisionTreeClassifier

X = X_train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = y_train[['Species']]

clf = DecisionTreeClassifier(max_leaf_nodes=25, criterion='entropy')

clf = clf.fit(X, y)

clf

clf.feature_importances_

from sklearn import tree

# X[0] -> SepalLengthCm, X[1] -> SepalWidthCm, X[2] -> PetalLengthCm, X[3] -> PetalWidthCm
fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(clf, fontsize=10)
plt.show()

from sklearn.metrics import accuracy_score

predictions = clf.predict(X_test)

accuracy_score(y_test, predictions)

print(predictions)

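# With three classes, per-class precision and recall are more informative than
# a single accuracy number; sklearn's classification_report summarises both
# (the labels shown are the category codes for the three species).
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))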