AdaBoostImplementation.R

#Kevin Song
#CS 373
#Pset 3 Question 6

#Collaborators: Dionna Jacobson, Ravina Jain

library(rpart)
library(ggplot2)
library(pROC)

#Import training data
training.features = read.table("TrainingFeatures.txt", header = FALSE)
training.labels = read.table("TrainingLabels.txt", header = FALSE)

training.data = cbind(training.labels, training.features)
colnames(training.data)[1] <- "Response"

#Import validation data
validation.features = read.table("ValidationFeatures.txt", header = FALSE)
validation.labels = read.table("ValidationLabels.txt", header = FALSE)

validation.data = cbind(validation.labels, validation.features)
colnames(validation.data)[1] <- "Response"

#Import testing data
testing.features = read.table("TestingFeatures.txt", header = FALSE)
testing.labels = read.table("TestingLabels.txt", header = FALSE)

testing.data = cbind(testing.labels, testing.features)
colnames(testing.data)[1] <- "Response"

#Sources: http://www.cs.utsa.edu/~bylander/pubs/FLAIRS06-108.pdf
#http://ttic.uchicago.edu/~tewari/lectures/lecture6.pdf
#and Lecture 16 notes
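
#For reference, the update rules implemented below (discrete AdaBoost, as in the sources
#above), with labels y_i in {-1, +1}, weak hypothesis h_t, and data weights w_i:
#  epsilon_t = sum of w_i over the points misclassified by h_t
#  alpha_t   = 0.5 * log((1 - epsilon_t) / epsilon_t)
#  w_i      <- w_i * exp(-alpha_t * y_i * h_t(x_i)) / Z_t,  where Z_t renormalizes the weights
#The final hypothesis is H(x) = sign(sum over t of alpha_t * h_t(x)).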

#This function takes a training data set, a validation (or test) set, and a number of
#boosting iterations, and returns the boosted model's {-1, +1} predictions on the
#validation set.
DiscreteAdaboost <- function(dataset, validation.set, iterations)
{
  #Initialize training data weights
  data.weights <- rep(1/nrow(dataset), nrow(dataset))

  weak.learners <- vector(mode="list") #list of rpart objects, or "weak learners"
  classifier.weights = c() #alphas for each iteration

  for(i in 1:iterations)
  {
    #Create weak learner: a depth-1 stump fit to the weighted training data
    set.seed(0)

    tree.control = rpart.control(maxdepth = 1)
    tree = rpart(Response ~ ., data = dataset, control = tree.control, weights = data.weights)

    #Get predictions vector using weak learner
    predictions = unname(predict(tree, dataset, type = "vector"))

    #Transform predictions vector into {-1, 1}
    transformed.predictions = sign(predictions)

    #Reweight data weights
    epsilon = 0
    alpha = 0
    z.norm.constant = 0

    #Get epsilon value: the total weight of the misclassified points
    for(a in 1:length(transformed.predictions))
    {
      if(transformed.predictions[a] != dataset[a,1])
      {
        epsilon = epsilon + data.weights[a]
      }
    }

    #Get alpha value
    alpha = 0.5 * log((1 - epsilon) / epsilon)

    #Get normalization constant value
    for(b in 1:length(data.weights))
    {
      z.norm.constant = z.norm.constant + (data.weights[b] * exp(-alpha * dataset[b,1] * transformed.predictions[b]))
    }

    #Update distribution of data weights
    for(c in 1:length(data.weights))
    {
      data.weights[c] = (data.weights[c] * exp(-alpha * dataset[c, 1] * transformed.predictions[c]) / z.norm.constant)
    }

    weak.learners[[i]] <- tree
    classifier.weights <- c(classifier.weights, alpha)
  }

  print(weak.learners) #Obtain the rules for each iteration of the model.

  #Get final hypothesis: the sign of the weighted vote of the weak learners

  output = 0

  for(d in 1:length(weak.learners))
  {
    weak.tree = weak.learners[[d]]

    hypothesis = unname(predict(weak.tree, validation.set, type = 'vector'))
    hypothesis = sign(hypothesis)

    output = output + hypothesis * classifier.weights[d]
  }

  print(classifier.weights) #Obtain the associated alphas of the model.

  return(sign(output))
}
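
#The three loops in the boosting step above can also be written in vectorized form. A
#minimal sketch of one reweighting step, assuming labels and predictions are {-1, +1} vectors:
adaboostStep <- function(labels, predictions, weights)
{
  epsilon = sum(weights[predictions != labels])       #weighted error
  alpha = 0.5 * log((1 - epsilon) / epsilon)          #classifier weight
  unnormalized = weights * exp(-alpha * labels * predictions)
  list(alpha = alpha, weights = unnormalized / sum(unnormalized)) #renormalize (divide by Z)
}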

#This function plots the boosted model's classification accuracy on the training and
#validation sets after each boosting iteration.
getAccuracyPlot <- function(dataset, validation.set, iterations)
{
  #Run the same boosting procedure as in DiscreteAdaboost, keeping every weak learner and alpha.

  #Initialize training data weights
  data.weights <- rep(1/nrow(dataset), nrow(dataset))

  weak.learners <- vector(mode="list") #list of rpart objects, or "weak learners"
  classifier.weights = c() #alphas for each iteration

  for(i in 1:iterations)
  {
    #Create weak learner
    set.seed(0)

    tree.control = rpart.control(maxdepth = 1)
    tree = rpart(Response ~ ., data = dataset, control = tree.control, weights = data.weights)

    #Get predictions vector using weak learner
    predictions = unname(predict(tree, dataset, type = "vector"))

    #Transform predictions vector into {-1, 1}
    transformed.predictions = sign(predictions)

    #Reweight data weights
    epsilon = 0
    alpha = 0
    z.norm.constant = 0

    #Get epsilon value
    for(a in 1:length(transformed.predictions))
    {
      if(transformed.predictions[a] != dataset[a,1])
      {
        epsilon = epsilon + data.weights[a]
      }
    }

    #Get alpha value
    alpha = 0.5 * log((1 - epsilon) / epsilon)

    #Get normalization constant value
    for(b in 1:length(data.weights))
    {
      z.norm.constant = z.norm.constant + (data.weights[b] * exp(-alpha * dataset[b,1] * transformed.predictions[b]))
    }

    #Update distribution of data weights
    for(c in 1:length(data.weights))
    {
      data.weights[c] = (data.weights[c] * exp(-alpha * dataset[c, 1] * transformed.predictions[c]) / z.norm.constant)
    }

    weak.learners[[i]] <- tree
    classifier.weights <- c(classifier.weights, alpha)
  }

  #Get predictive accuracies after each boosting iteration, on the training set.

  training.prediction = 0
  training.accuracies = c()

  for(d in 1:length(weak.learners))
  {
    for(e in 1:d)
    {
      weak.tree = weak.learners[[e]]
      train.hypothesis = unname(predict(weak.tree, dataset, type = 'vector'))
      train.hypothesis = sign(train.hypothesis)

      training.prediction = training.prediction + train.hypothesis * classifier.weights[e]
    }

    training.prediction = sign(training.prediction)

    training.accuracies <- c(training.accuracies, getAccuracy(training.prediction, dataset))

    training.prediction = 0
  }

  #Get predictive accuracies after each boosting iteration, on the validation set.

  validation.prediction = 0
  validation.accuracies = c()

  for(d in 1:length(weak.learners))
  {
    for(e in 1:d)
    {
      weak.tree = weak.learners[[e]]
      valid.hypothesis = unname(predict(weak.tree, validation.set, type = 'vector'))
      valid.hypothesis = sign(valid.hypothesis)

      validation.prediction = validation.prediction + valid.hypothesis * classifier.weights[e]
    }

    validation.prediction = sign(validation.prediction)

    validation.accuracies <- c(validation.accuracies, getAccuracy(validation.prediction, validation.set))

    validation.prediction = 0
  }

  #Plot accuracies of boosted model on training and validation data at every iteration:

  indices = seq.int(1, iterations)

  df <- data.frame(indices, training.accuracies, validation.accuracies)

  ggplot(df, aes(indices)) +
    geom_line(aes(y = training.accuracies, color = "Training accuracy")) +
    geom_line(aes(y = validation.accuracies, color = "Validation accuracy")) +
    ggtitle("Classification accuracy of boosted model, per iteration") +
    labs(x = "Boosting iteration", y = "Classification accuracy")

  #print(which.max(validation.accuracies))
}
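
#The nested loops above re-predict every stump at every iteration (O(iterations^2) predict
#calls). A sketch of an incremental alternative that accumulates the weighted vote once,
#using the same weak.learners, classifier.weights, and getAccuracy as in this file:
getAccuracies <- function(weak.learners, classifier.weights, data)
{
  running.vote = 0
  accuracies = c()
  for(t in 1:length(weak.learners))
  {
    stump.votes = sign(unname(predict(weak.learners[[t]], data, type = "vector")))
    running.vote = running.vote + classifier.weights[t] * stump.votes
    accuracies <- c(accuracies, getAccuracy(sign(running.vote), data))
  }
  return(accuracies)
}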

#This function returns the accuracy rate, given a prediction vector and a dataset for comparison.
getAccuracy <- function(prediction.vector, dataset)
{
  numCorrect = 0

  for(i in 1:length(prediction.vector))
  {
    if(prediction.vector[i] == dataset[i,1])
    {
      numCorrect = numCorrect + 1
    }
  }

  return(numCorrect / length(prediction.vector))
}
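
#Equivalently, in vectorized form (a sketch under the same assumption that the labels are
#in column 1 of dataset):
getAccuracyVectorized <- function(prediction.vector, dataset)
{
  return(mean(prediction.vector == dataset[,1]))
}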

#Problem 6b.

getAccuracyPlot(training.data, validation.data, 20)

#Problem 6c.

#The best boosted model uses 9 iterations, which maximizes accuracy on the validation data.
#We see evidence of overfitting as boosting iterations increase: classification accuracy on
#the validation data drops off after 9 iterations.

#As the iterations increase, the model fits the training data better and better (the
#training-accuracy line). The same trend does not hold for the validation data: the model
#increasingly overfits the training data and so generalizes less well to the validation data.

#Problem 6d.

#See the weight-update steps inside the boosting loops above (the epsilon, alpha,
#z.norm.constant, and data.weights computations).

DiscreteAdaboost(training.data, validation.data, 10)

#1st rule: Split on V14 = 0.00137654, alpha = 1.0235397, feature = COREST
#2nd rule: Split on V29 = 0.09152545, alpha = 0.4967663, feature = H3K27AC
#3rd rule: Split on V32 = 0.0102931, alpha = 0.3287472, feature = H3K4ME1
#4th rule: Split on V35 = 0.0368491, alpha = 0.4540716, feature = H3K79ME2
#5th rule: Split on V51 = 0.118309, alpha = 0.3781959, feature = NFIC
#6th rule: Split on V60 = 0.003321625, alpha = 0.2842413, feature = PHASTCONS
#7th rule: Split on V33 = 0.123807, alpha = 0.2300260, feature = H3K4ME2
#8th rule: Split on V34 = 0.07069245, alpha = 0.2975442, feature = H3K4ME3
#9th rule: Split on V14 = 0.00993149, alpha = 0.2308955, feature = CPGISLANDS
#10th rule: Split on V18 = 0.00993149, alpha = 0.2484172, feature = EBF1
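
#A sketch for reading each stump's split programmatically, rather than from the printed
#trees (assumes a list of fitted rpart stumps, e.g. the weak.learners built above; rpart
#stores splits in the "splits" matrix, whose row names are variables and whose "index"
#column holds the threshold):
printStumpSplits <- function(weak.learners)
{
  for(i in 1:length(weak.learners))
  {
    print(weak.learners[[i]]$splits[1, , drop = FALSE]) #first row is the primary split
  }
}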

#Problem 6e.

test.predictions = DiscreteAdaboost(training.data, testing.data, 9)

getAccuracy(test.predictions, testing.data)

#Our validated AdaBoost model has a classification accuracy of 89.84% on the test dataset.

#Problem 6f.

roc.result = roc(testing.data$Response, test.predictions)
print(roc.result)
plot(roc.result)

#AUC = 0.8984
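
#Note: test.predictions are hard {-1, +1} labels, so this ROC curve has a single operating
#point. A fuller curve could score each example by its real-valued weighted vote instead; a
#minimal sketch, assuming the same weak.learners and classifier.weights built during boosting:
getMargins <- function(weak.learners, classifier.weights, newdata)
{
  margins = 0
  for(d in 1:length(weak.learners))
  {
    margins = margins + classifier.weights[d] *
      sign(unname(predict(weak.learners[[d]], newdata, type = "vector")))
  }
  return(margins) #pass to roc() as the predictor for a multi-point ROC curve
}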