hw1_code
#!/usr/bin/env python
# coding: utf-8

# In[25]:


"""
Program: hw1.py
Programmed By: Md Mazharul Islam, Roshan Ruvanthika Krishnamurthy, Caul Pearson
Description: A homework file containing the implementation of an ID3 decision tree
Trace Folder: MdMazharul068
"""


import math
from collections import Counter

import pandas as pd

training_data = [
    ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'no'}, False),
    ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'yes'}, False),
    ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True),
    ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True),
    ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True),
    ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'yes'}, False),
    ({'level': 'Mid', 'lang': 'R', 'tweets': 'yes', 'phd': 'yes'}, True),
    ({'level': 'Senior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, False),
    ({'level': 'Senior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True),
    ({'level': 'Junior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'no'}, True),
    ({'level': 'Senior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'yes'}, True),
    ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'yes'}, True),
    ({'level': 'Mid', 'lang': 'Java', 'tweets': 'yes', 'phd': 'no'}, True),
    ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'yes'}, False)
]


# In[26]:


# Build the feature DataFrame from the dict half of each training pair,
# then append the label column:
df_input = pd.DataFrame([row[0] for row in training_data])
df_input["Result"] = [row[1] for row in training_data]
print(df_input)
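
# For reference, the printed frame should look roughly like this
# (14 rows; column order may vary with the pandas version):
#
#      level    lang tweets  phd  Result
# 0   Senior    Java     no   no   False
# 1   Senior    Java     no  yes   False
# 2      Mid  Python     no   no    True
# ...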


# In[27]:


def entropy(probs):
    '''
    Calculate the entropy for a list of probabilities.
    '''
    return sum([-prob * math.log(prob, 2) for prob in probs])


def entropy_of_list(a_list):
    '''
    Take a list of items with discrete values and return the entropy of those items.
    '''
    cnt = Counter(x for x in a_list)

    num_instances = len(a_list) * 1.0
    probs = [x / num_instances for x in cnt.values()]

    return entropy(probs)


# The initial entropy
# total_entropy = entropy_of_list(df_input['tweets'])
# print (total_entropy)
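
# A quick sanity check (a sketch; values rounded): the 14 labels above are
# 9 True and 5 False, so the starting entropy should come out near
#   -(9/14)*log2(9/14) - (5/14)*log2(5/14) ~= 0.940
# print(entropy_of_list(df_input['Result']))   # expected: ~0.940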


# In[28]:


def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    '''
    Calculate the information gain of splitting df on split_attribute_name.
    '''
    # Split the data by the possible values of the attribute:
    df_split = df.groupby(split_attribute_name)

    # Entropy of the target attribute, and the proportion of observations,
    # within each data split:
    nobs = len(df.index) * 1.0
    df_agg_ent = df_split.agg({target_attribute_name: [entropy_of_list, lambda x: len(x) / nobs]})[target_attribute_name]
    df_agg_ent.columns = ['Entropy', 'PropObservations']
    if trace:
        print(df_agg_ent)

    # Information gain = entropy before the split minus the weighted
    # entropy after the split:
    new_entropy = sum(df_agg_ent['Entropy'] * df_agg_ent['PropObservations'])
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy
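
# Worked example (a sketch; values rounded): splitting on 'level' gives
# Senior (2 True / 3 False), Mid (4 True), Junior (3 True / 2 False), so
#   gain = 0.940 - (5/14)*0.971 - (4/14)*0 - (5/14)*0.971 ~= 0.247
# print(information_gain(df_input, 'level', 'Result'))   # expected: ~0.247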


# In[29]:


def id3(df, target_attribute_name, attribute_names, default_class=None, is_root=1):
    ## Count the values of the target attribute:
    cnt = Counter(x for x in df[target_attribute_name])

    ## 1st base case: is this split of the dataset homogeneous?
    if len(cnt) == 1:
        return list(cnt.keys())[0]

    ## 2nd base case: is this split of the dataset empty, or are there no
    ## attributes left to split on? If so, return the default class.
    elif df.empty or (not attribute_names):
        return default_class

    ## Otherwise, divide the dataset:
    else:
        # Default value for the next recursive call of this function:
        index_of_max = list(cnt.values()).index(max(cnt.values()))
        default_class = list(cnt.keys())[index_of_max]  # most common value of target attribute in dataset
        # Select the best attribute to split on:
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
        index_of_max = gainz.index(max(gainz))
        best_attr = attribute_names[index_of_max]
        # Initialize the tree:
        tree = {best_attr: {}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        # Majority class at this node, used for the default branch below:
        bool_value = True
        if cnt[True] < cnt[False]:
            bool_value = False
        # Split the dataset and recursively call this algorithm on each split,
        # populating the empty tree with subtrees:
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                          target_attribute_name,
                          remaining_attribute_names,
                          default_class, 0)
            tree[best_attr][attr_val] = subtree
            ## Add a default branch (keyed on None) to each subtree so
            ## unmatched inputs classify to the majority class:
            if isinstance(subtree, bool) or is_root != 1:
                tree[best_attr][None] = bool_value
        return tree
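
# For the dataset above, the tree should come out roughly like the nested
# dict below (a sketch; the None keys are the default branches added above):
#
# {'level': {'Junior': {'phd': {'no': True, 'yes': False, None: True}},
#            'Mid': True,
#            'Senior': {'tweets': {'no': False, 'yes': True, None: False}},
#            None: True}}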


# In[30]:


# Predictor names (every column except the 'Result' label):
attribute_names = list(df_input.columns)
attribute_names.remove('Result')


# In[31]:


# Run the algorithm:
from pprint import pprint

tree = id3(df_input, 'Result', attribute_names)
print("-----Decision Tree------")
pprint(tree)


# In[32]:


## Testing the tree with inputs
sample1 = {"level": "Junior", "lang": "Java", "tweets": "yes", "phd": "no"}
sample2 = {"level": "Junior", "lang": "Java", "tweets": "yes", "phd": "yes"}
sample3 = {"level": "Intern"}
sample4 = {"level": "Senior"}


def classify(sample_data, tree):
    # Each internal node is a one-key dict: {attribute: {value: subtree, ...}}.
    for key in tree:
        if key in sample_data and sample_data[key] in tree[key]:
            result = tree[key][sample_data[key]]
            if isinstance(result, dict):
                return classify(sample_data, result)
            else:
                return result
        else:
            # Missing attribute or unseen value: follow the default (None) branch.
            return tree[key][None]

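# Samples 3 and 4 exercise the default (None) branches: 'Intern' is an unseen
# value of 'level', so sample3 should fall back to the overall majority class
# (True), while sample4 descends into the 'Senior' subtree, finds no 'tweets'
# key, and should fall back to that subtree's majority class (False). This is
# a sketch of the expected behavior given the tree built above.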


label = classify(sample1, tree)
print("---------Output For Sample 1")
print(label)

label = classify(sample2, tree)
print("---------Output For Sample 2")
print(label)

label = classify(sample3, tree)
print("---------Output For Sample 3")
print(label)

label = classify(sample4, tree)
print("---------Output For Sample 4")
print(label)



# In[33]:


## Dataset for Task 2

# Training data (read_csv already returns a DataFrame):
df_car_training = pd.read_csv("car_safety_training.csv",
                              usecols=['Price', 'Maintenance', 'Trunk', 'Safety', 'Acceptable'])
# Print the loaded data
print(df_car_training)
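
# The CSV files are assumed to sit next to this script with a header row
# matching the usecols above, e.g. (hypothetical values):
#
#   Price,Maintenance,Trunk,Safety,Acceptable
#   high,high,small,low,False
#   low,med,big,high,True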


# In[34]:


## Dataset for testing the decision tree
df_car_test = pd.read_csv("car_safety_test.csv",
                          usecols=['Price', 'Maintenance', 'Trunk', 'Safety', 'Acceptable'])
print(df_car_test)


# In[35]:


training_data = df_car_training
testing_data = df_car_test

# Predictor attributes (everything except the 'Acceptable' label):
attribute_names = list(df_car_training.columns)
attribute_names.remove('Acceptable')

train_tree = id3(training_data, 'Acceptable', attribute_names)
print("-----Decision Tree------")
pprint(train_tree)


# In[36]:


## Calculating the accuracy of the tree on the test data

data_list = testing_data.iloc[:, :-1].to_dict(orient="records")
data_list_actual_decision = testing_data.iloc[:, -1:].to_dict(orient="records")

actual_decision = []
predicted_output = []
for i in data_list:
    label = classify(i, train_tree)
    predicted_output.append(label)

for i in data_list_actual_decision:
    data_class = i['Acceptable']
    actual_decision.append(data_class)


df_actual_label = pd.DataFrame(actual_decision)
df_predicted_label = pd.DataFrame(predicted_output)

comparison_data = {
    'Actual': actual_decision,
    'Predicted': predicted_output
}
df_compare = pd.DataFrame(comparison_data, columns=['Actual', 'Predicted'])
print(df_compare)

count = len(actual_decision)
match = 0
for i in range(count):
    if actual_decision[i] == predicted_output[i]:
        match += 1

print("Accuracy =", match / count * 100, "%")
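
# An equivalent one-liner using the comparison frame built above (a sketch,
# assuming df_compare holds one row per test sample):
# print("Accuracy =", (df_compare['Actual'] == df_compare['Predicted']).mean() * 100, "%")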