Fenny_Theo

csc1001 group project part4

May 16th, 2020

import sys
from random import randrange

trainPath = "train.csv"
testPath = "test.csv"

def load_csv(path):
    '''
    load the dataset from a csv file
    @dataset in this format:
    [
    [x1, y1],
    [x2, y2],
    [x3, y3]
    ]
    where x1 stands for the 11 features in this case
    '''
    dataset, header = [], []
    with open(path, "r") as f:
        header = [h.strip() for h in f.readline().split(",")]
        for line in f:
            dataset.append([float(feature.strip()) for feature in line.split(",")])
    dataset = [row[:-1] + [1] if row[-1] > 6 else row[:-1] + [0] for row in dataset]  # binarize the last column: 1 if quality > 6, else 0
    return dataset
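
# A hypothetical example of what load_csv produces (feature values are illustrative,
# not taken from the real file): a raw row whose last column (quality) is 7, e.g.
#   7.4,0.7,...,9.4,7
# is loaded as [7.4, 0.7, ..., 9.4, 1], since quality > 6 maps to class 1 and
# everything else maps to class 0.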

def get_accuracy(y, y_p):
    '''
    return the accuracy given the actual y and the predicted y (y_p)
    '''
    return sum(a == b for a, b in zip(y, y_p)) / len(y) * 100

def cv(dataset, k_fold):
    '''
    split the dataset into k folds of equal length
    '''
    copy, fold_size = dataset[:], len(dataset) // k_fold
    return [[copy.pop(randrange(len(copy))) for _ in range(fold_size)] for i in range(k_fold)]
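
# For example (a sketch, not output from the real data): with 9 rows and k_fold = 3,
# cv returns 3 folds of 3 rows each, drawn at random without replacement; any
# remainder rows (len(dataset) % k_fold of them) are left out of the folds.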

def evaluate(dataset, model, k_fold, max_depth=8, min_size=5):
    '''
    1. prepare the dataset for k-fold cross validation at each iteration
    2. record the accuracy of each iteration in the list called acc
    '''
    folds, acc = cv(dataset, k_fold), []
    for i in range(len(folds)):
        test_set, train_set = folds[i], []
        for j in range(len(folds)):
            if j == i:
                continue  # this loop gathers the remaining folds into the train set
            train_set += folds[j]
        tree = model(train_set, max_depth, min_size)
        y_p = predict(tree, test_set)
        accuracy = get_accuracy(get_y(test_set), y_p)
        acc.append(accuracy)
    return tree, acc

def _split(i, threshold, dataset):
    '''
    split the dataset into two pieces given a threshold and the index of a feature
    '''
    left = [row for row in dataset if row[i] < threshold]
    right = [row for row in dataset if row[i] >= threshold]
    return left, right

def get_gini(left, right, classes):
    '''
    calculate the Gini index of a split, weighting each child by its size
    '''
    n1, n2 = len(left), len(right)
    n = n1 + n2
    # sum([...]) is the sum of squared class proportions p_k inside a child;
    # the labels are 1 (quality above 6) and 0 (otherwise), which does not affect the count
    gini_left = 0 if n1 == 0 else (1 - sum([([row[-1] for row in left].count(c) / n1) ** 2 for c in classes])) * (n1 / n)
    gini_right = 0 if n2 == 0 else (1 - sum([([row[-1] for row in right].count(c) / n2) ** 2 for c in classes])) * (n2 / n)
    return gini_left + gini_right
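
# A small worked example of get_gini (hypothetical rows, not from the dataset):
#   left  = [[7.0, 0], [6.8, 0]]            -> class counts {0: 2, 1: 0}
#   right = [[7.4, 1], [7.9, 1], [6.5, 0]]  -> class counts {0: 1, 1: 2}
#   gini_left  = (1 - (2/2)**2 - (0/2)**2) * 2/5 = 0.0
#   gini_right = (1 - (1/3)**2 - (2/3)**2) * 3/5 = 0.267 (rounded)
#   get_gini(left, right, {0, 1}) = 0.267 -- the lower the value, the purer the split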

def get_split(dataset):
    '''
    go through every value of every feature and find the cut that gives the lowest Gini index
    '''
    classes = {row[-1] for row in dataset}
    _i, _threshold, _gini, _children = float('inf'), float('inf'), float('inf'), None
    for i in range(len(dataset[0]) - 1):  # every feature except the quality label
        for row in dataset:
            children = _split(i, row[i], dataset)  # try each value of the ith feature as a candidate cut
            left, right = children
            gini = get_gini(left, right, classes)
            if gini < _gini:
                _i, _threshold, _gini, _children = i, row[i], gini, children
    return {'i': _i, 'threshold': _threshold, 'children': _children}

def get_y(dataset):
    '''
    return the last column, which is the y
    '''
    return [row[-1] for row in dataset]

def leaf(dataset):
    '''
    return the majority class in the given dataset
    '''
    y = get_y(dataset)
    return max(y, key=y.count)

def split(node, max_depth, min_size, depth):
    '''
    grow the tree recursively until one of three conditions is met:
    1. the maximum depth is reached
    2. a child holds fewer than min_size rows
    3. one side of the split is empty (every row falls on the same side)
    '''
    left, right = node['children']
    if not left or not right:
        node['left'] = node['right'] = leaf(left + right)
        return
    if depth <= max_depth and len(left) >= min_size:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    else:
        node['left'] = leaf(left)
    if depth <= max_depth and len(right) >= min_size:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth + 1)
    else:
        node['right'] = leaf(right)

def fit(dataset, max_depth, min_size):
    '''
    build the tree and keep track of the root node
    '''
    root = get_split(dataset)
    split(root, max_depth, min_size, 1)
    return root
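
# A sketch of the nested dict that fit can return (keys match the code above; the
# numbers are illustrative, not learned from the real data). Internal nodes are
# dicts and leaves are plain class labels (ints):
#   {'i': 10, 'threshold': 10.5, 'children': (left_rows, right_rows),
#    'left': 0,
#    'right': {'i': 1, 'threshold': 0.6, 'children': (...), 'left': 1, 'right': 0}}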

def _predict(node, row):
    '''
    for a row, walk down the tree and return the class stored in the leaf
    '''
    if row[node['i']] < node['threshold']:
        return node['left'] if type(node['left']) == int else _predict(node['left'], row)
    else:
        return node['right'] if type(node['right']) == int else _predict(node['right'], row)

def model(train, max_depth, min_size):
    '''
    train a decision tree on the training set and return it
    '''
    tree = fit(train, max_depth, min_size)
    return tree

def predict(tree, test):
    '''
    return the predicted class for every row of the test set
    '''
    return [_predict(tree, row) for row in test]

'''
default values (overridden by command-line arguments if given)
'''
k_fold = 3
max_depth = 8
min_size = 5
trainPath = "train.csv"
testPath = "test.csv"

if len(sys.argv) > 1:
    k_fold = int(sys.argv[1])
if len(sys.argv) > 2:
    max_depth = int(sys.argv[2])
if len(sys.argv) > 3:
    min_size = int(sys.argv[3])
if len(sys.argv) > 5:
    trainPath = sys.argv[4]
    testPath = sys.argv[5]

train = load_csv(trainPath)
tree, acc = evaluate(train, model, k_fold, max_depth, min_size)
print('CV Accuracy: %s' % acc)
print('Mean Accuracy: %.3f%%' % (sum(acc) / k_fold))

test = load_csv(testPath)
y_p = predict(tree, test)
print("Test Accuracy: %.3f%%" % get_accuracy(get_y(test), y_p))