# Untitled

#!/usr/bin/env python
# coding: utf-8

# In[25]:


"""
Program: hw1.py
Programmed By: Md Mazharul Islam, Roshan Ruvanthika Krishnamurthy, Caul Pearson
Description: A homework file containing the implementation of ID3 Decision Tree
Trace Folder: MdMazharul068
"""


import pandas as pd
# Task 1 training set: one (features, label) pair per row. The rows are kept as a
# compact table (level, lang, tweets, phd, label) and expanded into the
# ({feature: value}, label) shape that the rest of the script consumes.
training_data = [
    (dict(zip(('level', 'lang', 'tweets', 'phd'), features)), label)
    for *features, label in [
        ('Senior', 'Java', 'no', 'no', False),
        ('Senior', 'Java', 'no', 'yes', False),
        ('Mid', 'Python', 'no', 'no', True),
        ('Junior', 'Python', 'no', 'no', True),
        ('Junior', 'R', 'yes', 'no', True),
        ('Junior', 'R', 'yes', 'yes', False),
        ('Mid', 'R', 'yes', 'yes', True),
        ('Senior', 'Python', 'no', 'no', False),
        ('Senior', 'R', 'yes', 'no', True),
        ('Junior', 'Python', 'yes', 'no', True),
        ('Senior', 'Python', 'yes', 'yes', True),
        ('Mid', 'Python', 'no', 'yes', True),
        ('Mid', 'Java', 'yes', 'no', True),
        ('Junior', 'Python', 'no', 'yes', False),
    ]
]


# In[26]:


# Build the feature table from the dict half of each (features, label) pair,
# then attach the labels as the "Result" column.
df_input = pd.DataFrame([features for features, _ in training_data])
df_input["Result"] = [label for _, label in training_data]
print(df_input)


# In[27]:


def entropy(probs):
    """Return the Shannon entropy, in bits, of a list of probabilities.

    Args:
        probs: Iterable of probabilities (one per outcome).

    Returns:
        The sum of ``-p * log2(p)`` over the given probabilities; ``0`` for an
        empty iterable.

    Raises:
        ValueError: If a probability is negative (log of a negative number).
    """
    import math

    # p * log2(p) tends to 0 as p tends to 0, so a zero-probability outcome
    # contributes nothing; skipping it also avoids math.log2(0) raising.
    return sum(-prob * math.log2(prob) for prob in probs if prob != 0)


def entropy_of_list(a_list):
    """Return the entropy of the discrete values contained in *a_list*.

    Each distinct value's relative frequency is used as its probability and
    the resulting probabilities are passed to ``entropy``.
    """
    from collections import Counter

    frequencies = Counter(iter(a_list))
    total = float(len(a_list))

    return entropy([occurrences / total for occurrences in frequencies.values()])

# The initial entropy
# total_entropy = entropy_of_list(df_input['tweets'])
# print (total_entropy)


# In[28]:


def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Return the information gain from splitting *df* on one attribute.

    The gain is the entropy of the target column over all of *df* minus the
    weighted entropy of the target column within each group of
    *split_attribute_name* (weights are each group's share of the rows).

    Args:
        df: DataFrame holding both the split and the target column.
        split_attribute_name: Column to group the rows by.
        target_attribute_name: Column whose entropy is measured.
        trace: When truthy, print the per-group entropy/proportion table.
    """
    total_rows = float(len(df.index))

    def share_of_rows(group):
        # Fraction of all rows that fall into this group.
        return len(group) / total_rows

    # One row per attribute value: entropy of the target inside that group and
    # the group's share of the observations.
    per_value = df.groupby(split_attribute_name).agg(
        {target_attribute_name: [entropy_of_list, share_of_rows]}
    )[target_attribute_name]
    per_value.columns = ['Entropy', 'PropObservations']
    if trace:
        print(per_value)

    weighted_entropy = sum(per_value['Entropy'] * per_value['PropObservations'])
    return entropy_of_list(df[target_attribute_name]) - weighted_entropy


# In[29]:


def _majority_label(label_counts):
    """Return the most frequent label in a non-empty ``{label: count}`` mapping.

    Ties go to ``True`` when it is among the tied labels (the behaviour the
    default branch previously had for boolean targets); otherwise the tied
    label that was counted first wins.
    """
    top_count = max(label_counts.values())
    tied = [label for label, count in label_counts.items() if count == top_count]
    for label in tied:
        if label is True:
            return label
    return tied[0]


def id3(df, target_attribute_name, attribute_names, default_class=None, is_root=1):
    """Build an ID3 decision tree for *df* as nested dictionaries.

    Args:
        df: DataFrame containing the attribute columns and the target column.
        target_attribute_name: Name of the column holding the class labels.
        attribute_names: Names of the columns that may still be split on.
        default_class: Label returned when *df* is empty.
        is_root: Kept for backward compatibility; every internal node now
            receives a default branch, so the value no longer affects the
            result.

    Returns:
        Either a bare class label (leaf) or a dict of the form
        ``{attribute: {value: subtree, ..., None: majority_label}}``. The
        ``None`` key is the default branch, used for attribute values that did
        not occur in the data seen at that node. Labels may be of any hashable
        type, not only booleans.
    """
    from collections import Counter

    # Base case: nothing to learn from, fall back to the caller's default.
    if df.empty:
        return default_class

    cnt = Counter(x for x in df[target_attribute_name])

    # Base case: every row has the same label.
    if len(cnt) == 1:
        return next(iter(cnt))

    majority = _majority_label(cnt)

    # Base case: labels are mixed but no attribute is left to split on, so the
    # best available answer is the most common label of *this* subset.
    if not attribute_names:
        return majority

    # Split on the attribute with the highest information gain (first one wins
    # on equal gain).
    gains = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
    best_attr = attribute_names[gains.index(max(gains))]
    remaining_attribute_names = [name for name in attribute_names if name != best_attr]

    branches = {}
    for attr_val, data_subset in df.groupby(best_attr):
        branches[attr_val] = id3(data_subset,
                                 target_attribute_name,
                                 remaining_attribute_names,
                                 majority,
                                 0)

    # Default branch: unseen attribute values are classified as the majority
    # label of the rows that reached this node.
    branches[None] = majority
    return {best_attr: branches}


# In[30]:


# Predictor names: every column of df_input except the 'Result' label column.
attribute_names = df_input.columns.tolist()
attribute_names.remove('Result')


# In[31]:


# Run Algorithm:
from pprint import pprint

# Learn the Task 1 tree from the in-memory training table and show it.
tree = id3(
    df=df_input,
    target_attribute_name='Result',
    attribute_names=attribute_names,
)
print("-----Decision Tree------")
pprint(tree)


# In[32]:


## Inputs used to exercise the Task 1 tree.
sample1 = dict(level="Junior", lang="Java", tweets="yes", phd="no")
sample2 = dict(level="Junior", lang="Java", tweets="yes", phd="yes")
sample3 = dict(level="Intern")
sample4 = dict(level="Senior")
def classify(sample_data, tree):
    """Classify *sample_data* by walking a nested-dict decision tree.

    Args:
        sample_data: Mapping of attribute name to the sample's value.
        tree: Either a bare label (leaf) or a dict of the form
            ``{attribute: {value: subtree_or_label, ..., None: default}}``.

    Returns:
        The label reached by following the sample's attribute values. When the
        sample lacks the node's attribute, or its value has no branch, the
        node's default (``None``) branch is returned. An empty dict yields
        ``None``.

    Raises:
        KeyError: If a default branch is needed but the node has none.
    """
    node = tree
    # A non-dict node is a leaf label; this also covers a tree that is a single
    # label rather than a dict.
    while isinstance(node, dict):
        if not node:
            return None
        # Each internal node holds exactly one attribute.
        attribute = next(iter(node))
        branches = node[attribute]
        if attribute in sample_data and sample_data[attribute] in branches:
            node = branches[sample_data[attribute]]
        elif None in branches:
            return branches[None]
        else:
            raise KeyError(
                "no matching branch and no default (None) branch for attribute %r"
                % (attribute,)
            )
    return node


# Classify each sample with the Task 1 tree and print its label under a
# numbered header.
for sample_number, sample in enumerate((sample1, sample2, sample3, sample4), start=1):
    label = classify(sample, tree)
    print(f"---------Output For Sample {sample_number}")
    print(label)


# In[33]:


##Dataset For Task 2

# Training data: load only the five listed columns from the CSV.
data = pd.read_csv(
    "car_safety_training.csv",
    usecols=['Price', 'Maintenance', 'Trunk', 'Safety', 'Acceptable'],
)
df_car_training = pd.DataFrame(data)
# Show the loaded table.
print(df_car_training)


# In[34]:


## Test data for the decision tree: same five columns as the training file.
data = pd.read_csv(
    "car_safety_test.csv",
    usecols=['Price', 'Maintenance', 'Trunk', 'Safety', 'Acceptable'],
)
df_car_test = pd.DataFrame(data)
print(df_car_test)


# In[35]:


training_data = df_car_training
testing_data = df_car_test

# Predictors are every training column except the 'Acceptable' label column.
attribute_names = df_car_training.columns.tolist()
attribute_names.remove('Acceptable')
train_tree = id3(
    df=training_data,
    target_attribute_name='Acceptable',
    attribute_names=attribute_names,
)
print("-----Decision Tree------")
pprint(train_tree)


# In[36]:


## Calculating the accuracy of the learned tree on the test data

# Select columns by name rather than by position: read_csv(usecols=...) keeps
# the file's own column order, so 'Acceptable' is not guaranteed to be last.
data_list = testing_data.drop(columns=['Acceptable']).to_dict(orient="records")
actual_decision = testing_data['Acceptable'].tolist()
predicted_output = [classify(row, train_tree) for row in data_list]

comparison_data = {
    'Actual': actual_decision,
    'Predicted': predicted_output,
}
df_compare = pd.DataFrame(comparison_data, columns=['Actual', 'Predicted'])
print(df_compare)

count = len(actual_decision)
match = sum(
    actual == predicted
    for actual, predicted in zip(actual_decision, predicted_output)
)

# Guard against an empty test set, which would otherwise divide by zero.
if count:
    print("Accuracy =", match / count * 100, "%")
else:
    print("Accuracy = n/a (test set is empty)")