Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Data loading and fold assignment for k-fold cross-validation of a kNN
# classifier.
#
# Defines the globals used by the rest of the script:
#   theData  - shuffled data frame with columns f1..f5 plus a `folds` index
#   kfoldsk  - number of cross-validation folds
#   features - names of the feature columns passed to knn()
#   knnk     - candidate values of k (number of neighbours) to evaluate

# knn() is not part of base R; it is presumably the one from the recommended
# `class` package -- NOTE(review): confirm this is the intended implementation.
library(class)

# read in the data; the file has no header row
theData <- read.csv("40124332_train.csv", header = FALSE)

# The column names below assume one label column followed by four feature
# columns; fail early if the file has a different shape.
stopifnot(ncol(theData) == 5)

# Name the columns BEFORE the fold index is added. Assigning five names to the
# data frame after `folds` has been appended would leave the fold column with
# an NA name, so `theData$folds` would be NULL and every fold split would be
# empty.
colnames(theData) <- c("f1", "f2", "f3", "f4", "f5")

# we are using 5 folds for this cross validation
kfoldsk <- 5

# Shuffle the rows so that fold membership is random. Call set.seed() before
# this line if reproducible folds are needed.
theData <- theData[sample.int(nrow(theData)), ]

# cut the (shuffled) row positions into kfoldsk contiguous, near-equal folds;
# labels = FALSE yields integer fold ids 1..kfoldsk
theData$folds <- cut(seq_len(nrow(theData)), breaks = kfoldsk, labels = FALSE)

# the four feature columns used by the classifier (f1 is the label)
features <- c("f2", "f3", "f4", "f5")

# the k values (neighbour counts) to evaluate
knnk <- c(1, 3, 5, 9, 17, 33)
# Cross-validate every candidate k.
#
# For each k in `knnk`, each fold j in turn is held out as the validation set,
# knn() is trained on the remaining folds, and the fraction of held-out rows
# whose predicted label equals column f1 is printed.
#
# Reads the globals theData (with columns f1, `features`, and `folds`),
# kfoldsk, features and knnk.
# NOTE(review): knn() is presumably class::knn -- confirm the package is
# attached before this loop runs.
for (i in knnk) {
  print("K:")
  print(i)
  # seq_len() rather than 1:kfoldsk so a zero fold count runs no iterations
  for (j in seq_len(kfoldsk)) {
    # split data into training and validation sets for fold j
    in_fold <- theData$folds == j
    train_items <- theData[!in_fold, ]
    validation_items <- theData[in_fold, ]

    # fit knn on the training folds and predict the label (column f1) of
    # every validation row
    predictions <- knn(
      train_items[, features],
      validation_items[, features],
      train_items$f1,
      k = i
    )

    # `%in% TRUE` maps an NA comparison (e.g. a missing label) to FALSE.
    # Counting rows via validation_items[correct_list, ] would instead count
    # each NA as a correct prediction, because NA subscripts yield a row.
    correct_list <- (predictions == validation_items$f1) %in% TRUE

    # accuracy on this fold = share of correct predictions
    acc_rate <- mean(correct_list)
    print(acc_rate)
  }
}
# Obtain the most confusable digits for k = 3.
#
# Each fold is held out in turn and classified with knn() trained on the other
# folds. Predictions and true labels are collected per fold and pooled, so the
# confusion matrix covers every row of theData. Overwriting `predictions` on
# each iteration would leave a table that describes only the last fold.
#
# Reads the globals theData (with columns f1, `features`, and `folds`),
# kfoldsk and features.
# NOTE(review): knn() is presumably class::knn -- confirm the package is
# attached before this loop runs.
fold_predictions <- vector("list", kfoldsk)
fold_labels <- vector("list", kfoldsk)

for (i in seq_len(kfoldsk)) {
  in_fold <- theData$folds == i
  train_items <- theData[!in_fold, ]
  validation_items <- theData[in_fold, ]

  # fit knn model on this fold; store as character because the factor levels
  # returned by knn() depend on which labels occur in this training split
  fold_predictions[[i]] <- as.character(
    knn(
      train_items[, features],
      validation_items[, features],
      train_items$f1,
      k = 3
    )
  )
  fold_labels[[i]] <- as.character(validation_items$f1)
}

# Use one shared, sorted set of levels for both axes so the table is square
# and rows/columns line up even if a label is never predicted.
label_levels <- as.character(sort(unique(theData$f1)))
predictions <- factor(unlist(fold_predictions), levels = label_levels)
actual <- factor(unlist(fold_labels), levels = label_levels)

# Print the confusion matrix (rows = predicted label, columns = true label);
# with ten digit classes this is a 10x10 table. Explicit print() so the table
# is also shown when the script is run via source().
print(table(predictions, actual))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement