Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import math
import sys

# Command-line arguments: training data file and test data file.
# BUG FIX: the original read sys.argv[1]/[2] unconditionally, so running
# with too few arguments raised IndexError here before the friendly
# usage message further down could ever print. Guard the reads; the
# arg-count check below still handles the error path.
train_file = sys.argv[1] if len(sys.argv) > 1 else None
test_file = sys.argv[2] if len(sys.argv) > 2 else None
- #Nodes used to the create the tree
# Nodes used to create the decision tree.
class Node:
    """One node of the ID3 decision tree.

    Attributes set after construction by tree():
    children     -- child Nodes, one per attribute value (0, 1, 2)
    value        -- predicted class for a leaf; -1 while internal
    child_index  -- attribute column this node splits on; -1 if none
    """

    def __init__(self, entropy, unqualified_indices, identifier, lists, most):
        self.entropy = entropy                          # entropy of the class column over `lists`
        self.identifier = identifier                    # label like "attr = value" for printing
        self.lists = lists                              # the rows of data reaching this node
        self.most = most                                # tie-break / fallback class label
        self.unqualified_indices = unqualified_indices  # columns already used on this path
        self.children = []
        self.value = -1
        self.child_index = -1
# Used for the entropy of the current node.
def entropy(list_of_lists, index):
    """Shannon entropy (base 2) of column `index` over the given rows.

    Column values must be in {0, 1, 2} (the dataset's label alphabet).
    Returns 0 for an empty row list.
    """
    total = len(list_of_lists)
    if total == 0:
        return 0
    # FIX: renamed loop variable -- the original shadowed the builtin `list`.
    counts = [0, 0, 0]
    for row in list_of_lists:
        counts[row[index]] += 1
    result = 0
    for count in counts:
        pi = float(count) / float(total)
        if pi != 0:  # 0 * log(0) is taken as 0
            result += -pi * math.log(pi, 2)
    return result
# Split the list on one attribute and find the individual entropies.
def child_entropy(list_of_lists, index, comp):
    """Partition rows by the value in column `index` (0, 1, or 2).

    Returns (information_gain, partitions) where information_gain is the
    parent entropy of the class column minus the size-weighted entropy of
    each partition, and partitions is a list of three row lists.
    `comp` is the index of the class column.

    FIX: the original had an if/else whose two branches (`append` vs
    `extend([i])`) did exactly the same thing, plus a pointless alias of
    the input list; both removed.
    """
    partitions = [[], [], []]
    # Parent entropy over the class (last) column.
    gain = entropy(list_of_lists, len(list_of_lists[0]) - 1)
    for row in list_of_lists:
        partitions[row[index]].append(row)
    for subset in partitions:
        weight = float(len(subset)) / len(list_of_lists)
        gain -= weight * entropy(subset, comp)
    return (gain, partitions)
# Information gain.
def information_gain(parent_entropy, list_of_lists, unqualified_indices, comp):
    """Return the attribute column with the highest information gain.

    `parent_entropy` is accepted for interface compatibility but unused
    (child_entropy recomputes the parent entropy itself). Columns listed
    in `unqualified_indices` (already used, or the class column) are
    skipped. Returns -1 when no column qualifies.

    FIX: dropped the dead locals of the original (`ig`, the unused
    `entropy_list`, and the parallel `entropy_values` list) in favor of
    plain best-so-far tracking.
    """
    best_index = -1
    best_gain = 0
    for i in range(comp):
        if i in unqualified_indices:
            continue
        gain = child_entropy(list_of_lists, i, comp)[0]
        if best_index == -1 or gain > best_gain:
            best_index = i
            best_gain = gain
    return best_index
# Creates the tree of Nodes: recursive ID3 induction rooted at `root`.
def tree(root, titles):
    # Pure node (zero entropy): assign the majority class and stop.
    if root.entropy == 0:
        root.value = get_class(root, len(titles) - 1)
        return
    # Pick the attribute with the highest information gain among the
    # columns not yet used on this path.
    index = information_gain(root.entropy, root.lists, root.unqualified_indices, len(titles) - 1)
    if index == -1:
        # No attribute left to split on: fall back to the majority class.
        root.value = get_class(root, len(titles) - 1)
        return
    # NOTE(review): unqualified_indices is shared *by reference* with every
    # child node created below; the append here and the remove at the end
    # implement backtracking, so mutation order matters.
    root.unqualified_indices.append(index)
    # Partition the rows by the chosen attribute's value.
    temp = child_entropy(root.lists, index, len(titles) - 1)[1]
    root.children = []
    x = 0
    for i in temp:
        c_entropy = entropy(i, len(titles) - 1)
        id = (titles[index] + " = " + str(x))
        root.child_index = index
        # Suppress the label when the chosen column is the class column
        # itself (should not normally happen).
        if id == "class = " + str(x):
            id = None
        root.children.append(Node(c_entropy, root.unqualified_indices, id, i, root.most))
        tree(root.children[x], titles)
        x += 1
    # Backtrack: make the attribute available again for sibling subtrees.
    root.unqualified_indices.remove(index)
# The class of a leaf node.
def get_class(root, index):
    """Return the majority class (0, 1, or 2) of root.lists at column `index`.

    Ties are broken in favor of root.most; when the node holds no rows at
    all, root.most is returned directly.
    """
    # FIX: renamed loop variable -- the original shadowed the builtin `list`.
    counts = [0, 0, 0]
    for row in root.lists:
        counts[row[index]] += 1
    max_index = -1
    max_value = -1
    for i in range(3):
        if counts[i] > max_value and counts[i] != 0:
            max_index = i
            max_value = counts[i]
        elif counts[i] == max_value and root.most == i:
            # Equal count: prefer the overall majority class.
            max_index = i
            max_value = counts[i]
    if max_index == -1:
        return root.most
    return max_index
# Print the tree: leaves as "identifier : class", internal nodes as
# "identifier :" with their children indented one level deeper.
def showTree(root, tab):
    # BUG FIX: the original tested the *global* `root_entropy` here, which
    # is a NameError whenever showTree is used outside the training script;
    # the node's own entropy is what was intended (identical output at
    # every reachable call site, since recursion only visits nodes with
    # nonzero entropy).
    if root.entropy == 0:
        if root.identifier != None:
            print(tab + root.identifier + " :")
    for i in root.children:
        if i.value != -1:
            # Leaf: show the predicted class.
            print(tab + i.identifier + " : " + str(i.value))
        elif i.identifier != None:
            print(tab + i.identifier + " :")
            showTree(i, tab + "| ")
# Accuracy function.
def accuracy(root, titles, listoflists):
    """Fraction of rows in `listoflists` the tree classifies correctly,
    rounded to 3 decimal places.

    `titles` is unused but kept for interface compatibility. The class
    label is assumed to be in the last column (same width as root.lists).

    BUG FIX: the original iterated the *global* `list_of_lists` instead of
    its `listoflists` parameter -- it only worked because both call sites
    happened to pass that global. It also never advanced when a node had
    value == -1 and child_index == -1, looping forever; that case now
    breaks out and is counted as a misclassification.
    """
    right = 0
    wrong = 0
    class_col = len(root.lists[0]) - 1
    for row in listoflists:
        node = root
        value = node.value
        # Walk down the tree until a leaf supplies a class value.
        while value == -1:
            if node.child_index == -1:
                break  # malformed internal node: no split recorded
            node = node.children[row[node.child_index]]
            value = node.value
        if value == row[class_col]:
            right += 1
        else:
            wrong += 1
    return round(float(right) / float(right + wrong), 3)
# Load the data portion of a file into a list of integer rows.
def load_file(train_f):
    """Parse whitespace-separated integer rows from `train_f`, one row per
    line, silently skipping blank lines."""
    rows = []
    for raw_line in train_f:
        fields = raw_line.split()
        if fields:
            rows.append([int(field) for field in fields])
    return rows
# Load the headings.
def load_features(train_f):
    """Consume the first line of `train_f` and return its whitespace-
    separated column titles as a list of strings."""
    return train_f.readline().split()
# Make sure there are exactly two arguments (train file, test file).
if len(sys.argv) != 3:
    print('You must specify only a training data file and test data file in the program parameters; nothing more or less.')
else:
    # Inputting training data: a header line of titles, then integer rows.
    with open(train_file, 'r') as train_f:
        titles = load_features(train_f)
        list_of_lists = load_file(train_f)
    # Create the root node. The class column is the last one; it starts
    # out "unqualified" so it is never chosen as a split attribute.
    root_entropy = entropy(list_of_lists, len(list_of_lists[0]) - 1)
    lister = [len(list_of_lists[0]) - 1]
    root = Node(root_entropy, lister, "", list_of_lists, -1)
    # `most` is the majority class of the full training set, used as the
    # tie-break / fallback label throughout.
    root.most = get_class(root, len(list_of_lists[0]) - 1)
    # Create the decision tree.
    tree(root, titles)
    # Show the decision tree.
    showTree(root, '')
    # Show accuracy on the training data.
    # BUG FIX: bare `print` is a no-op expression statement in Python 3
    # (a Python 2 leftover); print() emits the intended blank line.
    print()
    print("Accuracy on training set (" + str(len(list_of_lists)) + " instances): " + str(100 * accuracy(root, titles, list_of_lists)) + "%")
    # Inputting test data: skip its header line, then read the rows.
    with open(test_file, 'r') as test_f:
        line = test_f.readline()
        list_of_lists = load_file(test_f)
    # Accuracy on the test data.
    print()
    print("Accuracy on test set (" + str(len(list_of_lists)) + " instances): " + str(100 * accuracy(root, titles, list_of_lists)) + "%")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement