Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- from random import randrange
# NOTE(review): these two defaults are re-assigned in the script section
# below (before their first use), so these lines are effectively dead.
trainPath = "train.csv"
testPath = "test.csv"
def load_csv(path):
    '''
    Load a dataset from a CSV file and binarize the target column.

    The first line is treated as a header and skipped.  Every remaining
    non-empty line is parsed as a list of floats; the last value (the
    quality score — presumably 0-10, confirm against the data source) is
    replaced with a binary class label: 1 if it is greater than 6, else 0.

    Returns a list of rows shaped [x1, x2, ..., xn, label].
    '''
    dataset = []
    with open(path, "r") as f:
        f.readline()  # skip the header row (column names are not used)
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # crashing on float('') as the original did.
            if not line.strip():
                continue
            row = [float(feature.strip()) for feature in line.split(",")]
            # Binarize the last COLUMN (the quality score) into 1/0.
            row[-1] = 1 if row[-1] > 6 else 0
            dataset.append(row)
    return dataset
def get_accuracy(y, y_p):
    '''
    Percentage of positions where the prediction matches the actual label.
    '''
    hits = 0
    for actual, predicted in zip(y, y_p):
        if actual == predicted:
            hits += 1
    return hits / len(y) * 100
def cv(dataset, k_fold):
    '''
    Randomly partition the dataset into k_fold folds of equal length.

    Rows are drawn without replacement from a working copy; any
    remainder rows (len(dataset) % k_fold) are simply discarded.
    '''
    pool = dataset[:]
    fold_size = len(dataset) // k_fold
    folds = []
    for _ in range(k_fold):
        fold = []
        for _ in range(fold_size):
            fold.append(pool.pop(randrange(len(pool))))
        folds.append(fold)
    return folds
def evaluate(dataset, model, k_fold, max_depth = 8, min_size = 5):
    '''
    K-fold cross-validate *model* on *dataset*.

    Each iteration holds out one fold as the test set, trains on the
    union of the remaining folds, and records the fold accuracy.
    Returns (tree, acc): the tree fitted in the LAST iteration and the
    per-fold accuracy list.
    '''
    folds = cv(dataset, k_fold)
    acc = []
    for held_out, test_set in enumerate(folds):
        # Training set = every fold except the held-out one.
        train_set = []
        for j, fold in enumerate(folds):
            if j != held_out:
                train_set.extend(fold)
        tree = model(train_set, max_depth, min_size)
        predictions = predict(tree, test_set)
        acc.append(get_accuracy(get_y(test_set), predictions))
    return tree, acc
- def _split(i, threshold, dataset):
- '''
- split dataset into two pieces given threshold and the index of feature
- '''
- left = [row for row in dataset if row[i] < threshold]
- right = [row for row in dataset if row[i] >= threshold]
- return left, right
def get_gini(left, right, classes):
    '''
    Weighted Gini impurity of a two-way split.

    Each child's impurity 1 - sum(p_c^2) is weighted by the child's
    share of the total row count; an empty child contributes 0.
    The class label is the last element of every row.
    '''
    n = len(left) + len(right)
    score = 0
    for child in (left, right):
        size = len(child)
        if size == 0:
            continue  # empty side adds nothing (and avoids div-by-zero)
        labels = [row[-1] for row in child]
        purity = sum((labels.count(c) / size) ** 2 for c in classes)
        score += (1 - purity) * (size / n)
    return score
def get_split(dataset):
    '''
    Exhaustive search for the best (feature, threshold) cut.

    Every observed value of every feature column is tried as a
    candidate threshold; the cut with the lowest weighted Gini wins.
    Ties keep the first candidate found (the comparison is strict).
    '''
    classes = {row[-1] for row in dataset}
    best = {'i': float('inf'), 'threshold': float('inf'), 'children': None}
    best_gini = float('inf')
    n_features = len(dataset[0]) - 1  # last column is the label, not a feature
    for i in range(n_features):
        for row in dataset:
            children = _split(i, row[i], dataset)
            gini = get_gini(children[0], children[1], classes)
            if gini < best_gini:
                best_gini = gini
                best = {'i': i, 'threshold': row[i], 'children': children}
    return best
def get_y(dataset):
    '''
    Extract the label column (last element of every row).
    '''
    labels = []
    for row in dataset:
        labels.append(row[-1])
    return labels
def leaf(dataset):
    '''
    Majority class label of the given rows.

    On a tie, the label appearing earliest in the row order wins
    (same tie-break as max(y, key=y.count)).
    '''
    labels = [row[-1] for row in dataset]
    best = labels[0]
    for label in labels:
        if labels.count(label) > labels.count(best):
            best = label
    return best
def split(node, max_depth, min_size, depth):
    '''
    build the tree until three conditions are met:
    1. max depth
    2. min size in the leaf node
    3. only one class left
    '''
    # Recursively grows the tree IN PLACE.  *node* is a dict from
    # get_split(); on return, node['left'] / node['right'] each hold
    # either another node dict (internal node) or a bare class label
    # (leaf) produced by leaf().
    left, right = node['children']
    # If the chosen cut left one side empty, the rows cannot be
    # separated further -> both branches become the same leaf.
    if not left or not right:
        node['left'] = node['right'] = leaf(left + right)
        return
    # Recurse on the left side unless the depth budget or the minimum
    # node size stops it.
    # NOTE(review): depth starts at 1 and the test is <=, so leaves can
    # sit at max_depth + 1 levels -- confirm that is intended.
    if depth <= max_depth and len(left) >= min_size:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    else:
        node['left'] = leaf(left)
    # Same treatment for the right side.
    if depth <= max_depth and len(right) >= min_size:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)
    else:
        node['right'] = leaf(right)
def fit(dataset, max_depth, min_size):
    '''
    Train a decision tree: pick the root cut, then grow it recursively.
    Returns the root node dict.
    '''
    root_node = get_split(dataset)          # best first cut over all rows
    split(root_node, max_depth, min_size, 1)  # grow in place from depth 1
    return root_node
- def _predict(node, row):
- '''
- for a row, go through the tree and return the class stored in the leaf
- '''
- if row[node['i']] < node['threshold']:
- return node['left'] if type(node['left']) == int else _predict(node['left'], row)
- else:
- return node['right'] if type(node['right']) == int else _predict(node['right'], row)
def model(train, max_depth, min_size):
    '''
    Thin wrapper so evaluate() can receive the training routine as a parameter.
    '''
    return fit(train, max_depth, min_size)
def predict(tree, test):
    '''
    Predict a class label for every row of *test* using *tree*.
    '''
    predictions = []
    for row in test:
        predictions.append(_predict(tree, row))
    return predictions
'''
default value
'''
# Script section: hyperparameter defaults, optional positional CLI
# overrides, then cross-validated training and a held-out test run.
#   argv[1] = k_fold, argv[2] = max_depth, argv[3] = min_size,
#   argv[4] / argv[5] = train / test csv paths.
k_fold = 3
max_depth = 8
min_size = 5
trainPath = "train.csv"
testPath = "test.csv"
if len(sys.argv) > 1:
    k_fold = int(sys.argv[1])
if len(sys.argv) > 2:
    max_depth = int(sys.argv[2])
if len(sys.argv) > 3:
    min_size = int(sys.argv[3])
# NOTE(review): BOTH paths must be supplied together; passing only
# argv[4] silently ignores it.
if len(sys.argv) > 5:
    trainPath = sys.argv[4]
    testPath = sys.argv[5]
# Cross-validate on the training csv; `tree` is the model fitted in the
# LAST fold iteration (see evaluate()).
train = load_csv(trainPath)
tree, acc = evaluate(train, model, k_fold, max_depth, min_size)
print('CV Accuracy: %s' % acc)
print('Mean Accuracy: %.3f%%' % (sum(acc)/k_fold))
# Score that last-fold tree on the held-out test csv.
test = load_csv(testPath)
y_p = predict(tree, test)
print("Test Accuracy: %.3f%%" % get_accuracy (get_y(test), y_p))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement