# Untitled

# Read in the data; the file has no header row.
theData <- read.csv("40124332_train.csv", header = FALSE)

# Name the label and the 4 feature columns BEFORE any bookkeeping column is
# added. Assigning five names to a six-column data frame would leave the
# `folds` column with an NA name, making `theData$folds` NULL further down.
# f1 is the label (the digit); f2..f5 are the features.
stopifnot(ncol(theData) >= 5)
colnames(theData)[1:5] <- c("f1", "f2", "f3", "f4", "f5")

# We are using 5 folds for this cross validation.
kfoldsk <- 5

# Shuffle the rows so that fold membership is random.
theData <- theData[sample(nrow(theData)), ]

# Cut the (shuffled) row positions into `kfoldsk` roughly equal folds,
# labelled 1..kfoldsk.
theData$folds <- cut(seq_len(nrow(theData)), breaks = kfoldsk, labels = FALSE)

# The 4 feature columns used by the classifier (f1 is the label, not a
# feature).
features <- c("f2", "f3", "f4", "f5")

# The k values (neighbourhood sizes) to evaluate.
knnk <- c(1, 3, 5, 9, 17, 33)

# For every k value, run k-fold cross validation: each fold is held out once
# as the validation set while the remaining folds form the training set.
# Prints the accuracy of every fold followed by the mean accuracy for that k.
for (i in knnk) {
  print("K:")
  print(i)

  # One accuracy per fold, preallocated so the mean can be reported below.
  fold_accuracy <- numeric(kfoldsk)

  for (j in seq_len(kfoldsk)) {
    # Split data into training and validation sets.
    train_items <- theData[theData$folds != j, ]
    validation_items <- theData[theData$folds == j, ]

    # Fit the knn model on this fold; we are predicting the digit, i.e. the
    # first column (f1). `class::knn` is namespace-qualified so the script
    # does not depend on the `class` package having been attached.
    predictions <- class::knn(
      train_items[, features],
      validation_items[, features],
      train_items$f1,
      k = i
    )

    # Fraction of validation rows whose predicted label matches the truth.
    acc_rate <- mean(predictions == validation_items$f1)
    fold_accuracy[j] <- acc_rate
    print(acc_rate)
  }

  # Summary figure that makes the different k values directly comparable.
  print("Mean accuracy:")
  print(mean(fold_accuracy))
}

# Obtain the most confusable digits for a k value of 3.
# Every row is predicted exactly once, by the model trained on the folds it
# does not belong to, so the confusion matrix below covers the whole data set
# rather than only the last fold.
predictions <- character(nrow(theData))
for (i in seq_len(kfoldsk)) {
  in_fold <- theData$folds == i

  # Fit the knn model on the other folds and predict the held-out fold.
  # as.character() avoids mixing factors whose level sets may differ
  # between folds.
  predictions[in_fold] <- as.character(
    class::knn(
      theData[!in_fold, features],
      theData[in_fold, features],
      theData$f1[!in_fold],
      k = 3
    )
  )
}

# Use the same label set for both dimensions so the matrix is square even if
# some digit is never predicted.
digit_levels <- sort(unique(theData$f1))

# Print the confusion matrix of the guesses (rows: predicted digit, columns:
# actual digit); large off-diagonal counts mark the most confusable pairs.
print(table(
  predicted = factor(predictions, levels = digit_levels),
  actual = factor(theData$f1, levels = digit_levels)
))