Decision Trees

import math
import sys
#the two system arguments
train_file = sys.argv[1]
test_file = sys.argv[2]

#Nodes used to the create the tree
class Node:
    def __init__(self, entropy, unqualified_indices, identifier, lists, most):
        self.entropy = entropy
        self.unqualified_indices = unqualified_indices
        self.lists = lists
        self.children = []
        self.identifier = identifier
        self.value = -1
        self.child_index = -1
        self.most = most

#used for the entropy of the current node
def entropy(list_of_lists, index):
    unique_value_count = [0,0,0]
    total = 0
    for list in list_of_lists:
        unique_value_count[list[index]] += 1
        total += 1
    entropy = 0
    if total == 0:
        return entropy
    for count in unique_value_count:
        pi = float(count) / float(total)
        if pi != 0:
            entropy += -pi * math.log(pi, 2)


    return entropy

# split the list and find the individual entropies
def child_entropy(list_of_lists, index,comp):
    entropy_list = [[],[],[]]
    entropy_values = [0,0,0]
    temp = list_of_lists
    ig = entropy(temp,len(temp[0])-1)
    for i in temp:
        if entropy_list[i[index]] != []:
            entropy_list[i[index]].append(i)
        else:
            entropy_list[i[index]].extend([i])
    count = 0

    for lists in entropy_list:
        entropy_values[count] = entropy(lists, comp)
        ig -= float(len(entropy_list[count]))/len(temp) * entropy_values[count]
        count +=1
    return (ig, entropy_list)

#information gain
def information_gain(parent_entropy, list_of_lists, unqualified_indices,comp):
    entropy_values = [None] * (comp +1)
    entropy_list = [None] * (comp + 1)
    best_index = -1
    ig = 0
    for i in range(comp):

        entropy_values[i] = 0
        if i in unqualified_indices:
            continue
        entropy_values[i] = child_entropy(list_of_lists, i,comp)[0]
        if best_index == -1 or entropy_values[i] > entropy_values[best_index]:
            best_index = i

    return best_index
#creates the tree of Nodes
def tree(root, titles):
    if root.entropy == 0:
        root.value = get_class(root,len(titles)-1)
        return
    index =  information_gain(root.entropy, root.lists ,root.unqualified_indices,len(titles)-1)
    if index == -1:
        root.value = get_class(root,len(titles)-1)
        return
    root.unqualified_indices.append(index)
    temp = child_entropy(root.lists, index, len(titles)-1)[1]
    root.children = []
    x = 0
    for i in temp:
        c_entropy = entropy(i,len(titles)-1)
        id = (titles[index] + " = " + str(x))
        root.child_index = index
        if id == "class = " + str(x):
            id = None
        root.children.append(Node(c_entropy, root.unqualified_indices, id, i,root.most))
        tree(root.children[x],titles)
        x += 1
    root.unqualified_indices.remove(index)

#The class of a leaf node
def get_class(root, index):
    unique_value_count = [0,0,0]
    for list in root.lists:
        unique_value_count[list[index]] += 1
    max_index = -1
    max_value = -1
    for i in range(3):
        if unique_value_count[i] > max_value and unique_value_count[i] != 0:
            max_index = i
            max_value = unique_value_count[i]
        elif unique_value_count[i] == max_value and root.most == i:
            max_index = i
            max_value = unique_value_count[i]

    if max_index == -1:
        return root.most

    return max_index

#print the tree of the function
def showTree(root,tab):
    if root_entropy == 0:
        if root.identifier != None:
            print(tab + root.identifier + " :")

    for i in root.children:
        if i.value != -1:
            print(tab + i.identifier + " : " + str(i.value))
        elif i.identifier != None:
            print(tab + i.identifier + " :")
        showTree(i,tab + "| ")

#accuracy function
def accuracy(root,titles, listoflists):
    right = 0
    wrong = 0
    index = -1
    temp = root
    for i in list_of_lists:
        value = root.value
        temp = root
        while value == -1:
            if temp.child_index != -1:
                temp = temp.children[i[temp.child_index]]
                value = temp.value

        if value == i[len(root.lists[0])-1]:
            right +=1
            value = -2
        else:
            wrong += 1
            value = -2

    return round(float(right) / float(right + wrong),3)

#load the file into a list of a list
def load_file(train_f):

    list_of_lists = []
    for line in train_f:
        stripped_line = line.strip()
        line_list = [int(x) for x in stripped_line.split()]
        if len(line_list) > 0:
            list_of_lists.append(line_list)
    return list_of_lists

#load the headings
def load_features(train_f):
    titles = []
    line = train_f.readline()
    stripped_line = line.strip()
    titles = stripped_line.split()
    return titles

#make sure their are two arguments
if len(sys.argv) != 3:
    print('You must specify only a training data file and test data file in the program parameters; nothing more or less.')
else:
    #inputting training data
    train_f = open(train_file, 'r')
    titles =  load_features(train_f)
    list_of_lists = load_file(train_f)
    train_f.close()

    #create root node
    root_entropy = entropy(list_of_lists, len(list_of_lists[0]) - 1)
    lister = [len(list_of_lists[0]) - 1]
    root = Node (root_entropy, lister, "", list_of_lists,-1)
    root.most = get_class(root,len(list_of_lists[0]) - 1)
    #create the decision tree
    tree(root,titles)
    #show decision tree
    showTree(root , '')
    #show accuracy of training data
    print
    print("Accuracy on training set (" + str(len(list_of_lists)) + " instances): "+ str(100 * accuracy(root,titles, list_of_lists)) + "%")
    #inputting test data
    test_f = open(test_file, 'r')
    line = test_f.readline()
    list_of_lists = load_file(test_f)
    test_f.close()
    #accuracy of test data
    print
    print("Accuracy on test set (" + str(len(list_of_lists)) + " instances): "+ str(100 * accuracy(root,titles, list_of_lists)) + "%")