#!/usr/bin/env python3
# coding: utf-8

# In[25]:
"""
Program: hw1.py
Programmed By: Md Mazharul Islam, Roshan Ruvanthika Krishnamurthy, Caul Pearson
Description: A homework file containing an implementation of the ID3 decision tree algorithm
Trace Folder: MdMazharul068
"""
import math
from collections import Counter
from pprint import pprint

import pandas as pd

# Each record is a (feature dict, boolean label) pair.
training_data = [
    ({'level': 'Senior', 'lang': 'Java',   'tweets': 'no',  'phd': 'no'},  False),
    ({'level': 'Senior', 'lang': 'Java',   'tweets': 'no',  'phd': 'yes'}, False),
    ({'level': 'Mid',    'lang': 'Python', 'tweets': 'no',  'phd': 'no'},  True),
    ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no',  'phd': 'no'},  True),
    ({'level': 'Junior', 'lang': 'R',      'tweets': 'yes', 'phd': 'no'},  True),
    ({'level': 'Junior', 'lang': 'R',      'tweets': 'yes', 'phd': 'yes'}, False),
    ({'level': 'Mid',    'lang': 'R',      'tweets': 'yes', 'phd': 'yes'}, True),
    ({'level': 'Senior', 'lang': 'Python', 'tweets': 'no',  'phd': 'no'},  False),
    ({'level': 'Senior', 'lang': 'R',      'tweets': 'yes', 'phd': 'no'},  True),
    ({'level': 'Junior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'no'},  True),
    ({'level': 'Senior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'yes'}, True),
    ({'level': 'Mid',    'lang': 'Python', 'tweets': 'no',  'phd': 'yes'}, True),
    ({'level': 'Mid',    'lang': 'Java',   'tweets': 'yes', 'phd': 'no'},  True),
    ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no',  'phd': 'yes'}, False)
]
# In[26]:
# Build a DataFrame: one column per feature, plus the boolean target 'Result'.
df_input = pd.DataFrame([row[0] for row in training_data])
df_input["Result"] = [row[1] for row in training_data]
print(df_input)
# In[27]:
def entropy(probs):
    '''Calculate the entropy (in bits) of a list of probabilities.'''
    return sum(-prob * math.log(prob, 2) for prob in probs)

def entropy_of_list(a_list):
    '''Take a list of items with discrete values and return the entropy of their distribution.'''
    cnt = Counter(a_list)
    num_instances = len(a_list)
    probs = [count / num_instances for count in cnt.values()]
    return entropy(probs)
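# Quick worked example (added as a sanity check, not part of the assignment):
# a fair coin has two outcomes with probability 0.5 each, so its entropy
# should come out to exactly 1 bit.
print(entropy([0.5, 0.5]))             # -> 1.0
print(entropy_of_list([True, False]))  # -> 1.0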
# The initial entropy of the full dataset, measured on the target column:
total_entropy = entropy_of_list(df_input['Result'])
print(total_entropy)
# In[28]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    '''
    Calculate the information gain of splitting df on split_attribute_name,
    measured against target_attribute_name.
    '''
    # Split the data by the possible values of the attribute:
    df_split = df.groupby(split_attribute_name)
    # For each split, compute the entropy of the target attribute and the
    # proportion of observations that landed in that split:
    nobs = len(df.index)
    df_agg_ent = df_split.agg({target_attribute_name: [entropy_of_list, lambda x: len(x) / nobs]})[target_attribute_name]
    df_agg_ent.columns = ['Entropy', 'PropObservations']
    if trace:
        print(df_agg_ent)
    # Information gain = entropy before the split minus the weighted
    # average entropy after the split:
    new_entropy = sum(df_agg_ent['Entropy'] * df_agg_ent['PropObservations'])
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy
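# Illustrative check (added; not required by the assignment): the information
# gain of each candidate attribute on the training set. The attribute with
# the highest gain is the one id3() below will pick as the root split.
for attr in ['level', 'lang', 'tweets', 'phd']:
    print(attr, information_gain(df_input, attr, 'Result'))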
# In[29]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    '''Build an ID3 decision tree as a nested dict: {attribute: {value: subtree}}.'''
    # Count the values of the target attribute:
    cnt = Counter(df[target_attribute_name])
    ## 1st base case: is this split of the dataset homogeneous?
    if len(cnt) == 1:
        return list(cnt.keys())[0]
    ## 2nd base case: is this split empty, or are we out of attributes?
    ## If so, return a default value.
    elif df.empty or (not attribute_names):
        return default_class
    ## Otherwise, divide the dataset:
    else:
        # The most common value of the target attribute in this split becomes
        # the default class for the next recursive calls:
        index_of_max = list(cnt.values()).index(max(cnt.values()))
        default_class = list(cnt.keys())[index_of_max]
        # Select the best attribute to split on (highest information gain):
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
        best_attr = attribute_names[gainz.index(max(gainz))]
        # Initialize the tree, then split the dataset and recursively call
        # this algorithm on each split, populating the tree with subtrees:
        tree = {best_attr: {}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                          target_attribute_name,
                          remaining_attribute_names,
                          default_class)
            tree[best_attr][attr_val] = subtree
        ## Add a default branch (keyed by None) so attribute values never
        ## seen during training fall back to this node's majority class:
        tree[best_attr][None] = default_class
        return tree
# In[30]:
# Predictor names: every column except the target 'Result'.
attribute_names = list(df_input.columns)
attribute_names.remove('Result')
# In[31]:
# Run the algorithm:
tree = id3(df_input, 'Result', attribute_names)
print("-----Decision Tree------")
pprint(tree)
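# For orientation (added note): the printed tree is a nested dict whose single
# top-level key is the root attribute (the highest-gain attribute from the
# check above); each value underneath is either a leaf label or another
# one-key dict, with None marking the default branch, e.g.:
#     {'level': {'Junior': {...}, 'Mid': ..., 'Senior': {...}, None: ...}}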
# In[32]:
## Testing the tree with hand-made inputs
sample1 = {"level": "Junior", "lang": "Java", "tweets": "yes", "phd": "no"}
sample2 = {"level": "Junior", "lang": "Java", "tweets": "yes", "phd": "yes"}
sample3 = {"level": "Intern"}
sample4 = {"level": "Senior"}

def classify(sample_data, tree):
    '''Walk the nested-dict tree until a leaf (non-dict) value is reached.'''
    for key in tree:  # the tree has exactly one key: the split attribute
        if key in sample_data and sample_data[key] in tree[key]:
            result = tree[key][sample_data[key]]
            if isinstance(result, dict):
                return classify(sample_data, result)
            else:
                return result
        else:
            # Missing attribute or unseen value: take the default branch.
            return tree[key][None]
label = classify(sample1, tree)
print("---------Output For Sample 1")
print(label)
label = classify(sample2, tree)
print("---------Output For Sample 2")
print(label)
label = classify(sample3, tree)
print("---------Output For Sample 3")
print(label)
label = classify(sample4, tree)
print("---------Output For Sample 4")
print(label)
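# Note (added): sample3 uses the unseen level 'Intern' and sample4 omits most
# attributes, so both classifications fall through to the None default
# branches that id3() attaches at each node.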
# In[33]:
## Dataset for Task 2
# Training data (read_csv already returns a DataFrame)
df_car_training = pd.read_csv("car_safety_training.csv",
                              usecols=['Price', 'Maintenance', 'Trunk', 'Safety', 'Acceptable'])
# Preview the loaded data
print(df_car_training)

# In[34]:
## Dataset for testing the decision tree
df_car_test = pd.read_csv("car_safety_test.csv",
                          usecols=['Price', 'Maintenance', 'Trunk', 'Safety', 'Acceptable'])
print(df_car_test)
# In[35]:
training_data = df_car_training
testing_data = df_car_test
# Predictor names: every column except the target 'Acceptable'.
attribute_names = list(df_car_training.columns)
attribute_names.remove('Acceptable')
train_tree = id3(training_data, 'Acceptable', attribute_names)
print("-----Decision Tree------")
pprint(train_tree)
# In[36]:
## Calculating the accuracy of the tree on the test data
data_list = testing_data.iloc[:, :-1].to_dict(orient="records")
actual_decision = testing_data['Acceptable'].tolist()
predicted_output = []
for row in data_list:
    label = classify(row, train_tree)
    predicted_output.append(label)
comparison_data = {
    'Actual': actual_decision,
    'Predicted': predicted_output
}
df_compare = pd.DataFrame(comparison_data, columns=['Actual', 'Predicted'])
print(df_compare)
count = len(actual_decision)
match = 0
for i in range(count):
    if actual_decision[i] == predicted_output[i]:
        match += 1
print("Accuracy =", match / count * 100, "%")