Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- from random import randrange
# NOTE(review): these two defaults are re-assigned in the script section
# below (before their first use), so these lines are effectively dead.
trainPath = "train.csv"
testPath = "test.csv"
def load_csv(path):
    '''
    Load a dataset from a CSV file and binarize the target column.

    The first line is treated as a header and skipped.  Every remaining
    non-empty line is parsed as a list of floats; the last value (the
    quality score — presumably 0-10, confirm against the data source) is
    replaced with a binary class label: 1 if it is greater than 6, else 0.

    Returns a list of rows shaped [x1, x2, ..., xn, label].
    '''
    dataset = []
    with open(path, "r") as f:
        f.readline()  # skip the header row (column names are not used)
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # crashing on float('') as the original did.
            if not line.strip():
                continue
            row = [float(feature.strip()) for feature in line.split(",")]
            # Binarize the last COLUMN (the quality score) into 1/0.
            row[-1] = 1 if row[-1] > 6 else 0
            dataset.append(row)
    return dataset
def get_accuracy(y, y_p):
    '''
    Percentage of positions where the prediction matches the actual label.
    '''
    hits = 0
    for actual, predicted in zip(y, y_p):
        if actual == predicted:
            hits += 1
    return hits / len(y) * 100
def cv(dataset, k_fold):
    '''
    Randomly partition the dataset into k_fold folds of equal length.

    Rows are drawn without replacement from a working copy; any
    remainder rows (len(dataset) % k_fold) are simply discarded.
    '''
    pool = dataset[:]
    fold_size = len(dataset) // k_fold
    folds = []
    for _ in range(k_fold):
        fold = []
        for _ in range(fold_size):
            fold.append(pool.pop(randrange(len(pool))))
        folds.append(fold)
    return folds
def evaluate(dataset, model, k_fold, max_depth = 8, min_size = 5):
    '''
    K-fold cross-validate *model* on *dataset*.

    Each iteration holds out one fold as the test set, trains on the
    union of the remaining folds, and records the fold accuracy.
    Returns (tree, acc): the tree fitted in the LAST iteration and the
    per-fold accuracy list.
    '''
    folds = cv(dataset, k_fold)
    acc = []
    for held_out, test_set in enumerate(folds):
        # Training set = every fold except the held-out one.
        train_set = []
        for j, fold in enumerate(folds):
            if j != held_out:
                train_set.extend(fold)
        tree = model(train_set, max_depth, min_size)
        predictions = predict(tree, test_set)
        acc.append(get_accuracy(get_y(test_set), predictions))
    return tree, acc
- def _split(i, threshold, dataset):
- '''
- split dataset into two pieces given threshold and the index of feature
- '''
- left = [row for row in dataset if row[i] < threshold]
- right = [row for row in dataset if row[i] >= threshold]
- return left, right
def get_gini(left, right, classes):
    '''
    Weighted Gini impurity of a two-way split.

    Each child's impurity 1 - sum(p_c^2) is weighted by the child's
    share of the total row count; an empty child contributes 0.
    The class label is the last element of every row.
    '''
    n = len(left) + len(right)
    score = 0
    for child in (left, right):
        size = len(child)
        if size == 0:
            continue  # empty side adds nothing (and avoids div-by-zero)
        labels = [row[-1] for row in child]
        purity = sum((labels.count(c) / size) ** 2 for c in classes)
        score += (1 - purity) * (size / n)
    return score
def get_split(dataset):
    '''
    Exhaustive search for the best (feature, threshold) cut.

    Every observed value of every feature column is tried as a
    candidate threshold; the cut with the lowest weighted Gini wins.
    Ties keep the first candidate found (the comparison is strict).
    '''
    classes = {row[-1] for row in dataset}
    best = {'i': float('inf'), 'threshold': float('inf'), 'children': None}
    best_gini = float('inf')
    n_features = len(dataset[0]) - 1  # last column is the label, not a feature
    for i in range(n_features):
        for row in dataset:
            children = _split(i, row[i], dataset)
            gini = get_gini(children[0], children[1], classes)
            if gini < best_gini:
                best_gini = gini
                best = {'i': i, 'threshold': row[i], 'children': children}
    return best
def get_y(dataset):
    '''
    Extract the label column (last element of every row).
    '''
    labels = []
    for row in dataset:
        labels.append(row[-1])
    return labels
def leaf(dataset):
    '''
    Majority class label of the given rows.

    On a tie, the label appearing earliest in the row order wins
    (same tie-break as max(y, key=y.count)).
    '''
    labels = [row[-1] for row in dataset]
    best = labels[0]
    for label in labels:
        if labels.count(label) > labels.count(best):
            best = label
    return best
def split(node, max_depth, min_size, depth):
    '''
    build the tree until three conditions are met:
    1. max depth
    2. min size in the leaf node
    3. only one class left
    '''
    # Recursively grows the tree IN PLACE.  *node* is a dict from
    # get_split(); on return, node['left'] / node['right'] each hold
    # either another node dict (internal node) or a bare class label
    # (leaf) produced by leaf().
    left, right = node['children']
    # If the chosen cut left one side empty, the rows cannot be
    # separated further -> both branches become the same leaf.
    if not left or not right:
        node['left'] = node['right'] = leaf(left + right)
        return
    # Recurse on the left side unless the depth budget or the minimum
    # node size stops it.
    # NOTE(review): depth starts at 1 and the test is <=, so leaves can
    # sit at max_depth + 1 levels -- confirm that is intended.
    if depth <= max_depth and len(left) >= min_size:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    else:
        node['left'] = leaf(left)
    # Same treatment for the right side.
    if depth <= max_depth and len(right) >= min_size:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)
    else:
        node['right'] = leaf(right)
def fit(dataset, max_depth, min_size):
    '''
    Train a decision tree: pick the root cut, then grow it recursively.
    Returns the root node dict.
    '''
    root_node = get_split(dataset)          # best first cut over all rows
    split(root_node, max_depth, min_size, 1)  # grow in place from depth 1
    return root_node
- def _predict(node, row):
- '''
- for a row, go through the tree and return the class stored in the leaf
- '''
- if row[node['i']] < node['threshold']:
- return node['left'] if type(node['left']) == int else _predict(node['left'], row)
- else:
- return node['right'] if type(node['right']) == int else _predict(node['right'], row)
def model(train, max_depth, min_size):
    '''
    Thin wrapper so evaluate() can receive the training routine as a parameter.
    '''
    return fit(train, max_depth, min_size)
def predict(tree, test):
    '''
    Predict a class label for every row of *test* using *tree*.
    '''
    predictions = []
    for row in test:
        predictions.append(_predict(tree, row))
    return predictions
'''
default value
'''
# Script section: hyperparameter defaults, optional positional CLI
# overrides, then cross-validated training and a held-out test run.
#   argv[1] = k_fold, argv[2] = max_depth, argv[3] = min_size,
#   argv[4] / argv[5] = train / test csv paths.
k_fold = 3
max_depth = 8
min_size = 5
trainPath = "train.csv"
testPath = "test.csv"
if len(sys.argv) > 1:
    k_fold = int(sys.argv[1])
if len(sys.argv) > 2:
    max_depth = int(sys.argv[2])
if len(sys.argv) > 3:
    min_size = int(sys.argv[3])
# NOTE(review): BOTH paths must be supplied together; passing only
# argv[4] silently ignores it.
if len(sys.argv) > 5:
    trainPath = sys.argv[4]
    testPath = sys.argv[5]
# Cross-validate on the training csv; `tree` is the model fitted in the
# LAST fold iteration (see evaluate()).
train = load_csv(trainPath)
tree, acc = evaluate(train, model, k_fold, max_depth, min_size)
print('CV Accuracy: %s' % acc)
print('Mean Accuracy: %.3f%%' % (sum(acc)/k_fold))
# Score that last-fold tree on the held-out test csv.
test = load_csv(testPath)
y_p = predict(tree, test)
print("Test Accuracy: %.3f%%" % get_accuracy (get_y(test), y_p))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement