# (Pastebin page header removed — not part of the script.)
# Kevin Song
# CS 373
# Pset 3 Question 6
# Collaborators: Dionna Jacobson, Ravina Jain

library(rpart)

# Read a features file and a labels file and bind them into one data frame
# whose first column is the +/-1 response, renamed to "Response".
# (Replaces the original copy-pasted load logic; also drops the attach()
# calls — they were redundant since every model call passes `data =`
# explicitly, and one of them attached the wrong object, testing.labels.)
load.dataset <- function(features.file, labels.file) {
  features <- read.table(features.file, header = FALSE)
  labels <- read.table(labels.file, header = FALSE)
  dataset <- cbind(labels, features)
  colnames(dataset)[1] <- "Response"
  dataset
}

# Import training, validation, and testing data.
# NOTE(review): "validationLabels.txt" has a lower-case v while every other
# file name is capitalized — confirm the actual file name on disk.
training.data <- load.dataset("TrainingFeatures.txt", "TrainingLabels.txt")
validation.data <- load.dataset("ValidationFeatures.txt", "validationLabels.txt")
testing.data <- load.dataset("TestingFeatures.txt", "TestingLabels.txt")

# Sources: http://www.cs.utsa.edu/~bylander/pubs/FLAIRS06-108.pdf
# http://ttic.uchicago.edu/~tewari/lectures/lecture6.pdf
# and Lecture 16 notes
# Discrete AdaBoost with depth-1 rpart stumps as weak learners.
#
# Args:
#   dataset:        training data frame; column 1 is the +/-1 response,
#                   named "Response", remaining columns are features.
#   validation.set: data frame with the same layout, to predict on.
#   iterations:     number of boosting rounds.
#   verbose:        print the fitted stumps and their alphas (default TRUE,
#                   matching the original behavior).
#
# Returns: a vector of +/-1 predictions for validation.set, the sign of the
#          alpha-weighted vote of the weak learners.
DiscreteAdaboost <- function(dataset, validation.set, iterations, verbose = TRUE)
{
  labels <- dataset[, 1]
  n <- nrow(dataset)
  # Initialize uniform training-data weights.
  data.weights <- rep(1 / n, n)
  # Preallocate instead of growing inside the loop.
  weak.learners <- vector(mode = "list", length = iterations)
  classifier.weights <- numeric(iterations)
  tree.control <- rpart.control(maxdepth = 1)
  set.seed(0)  # hoisted out of the loop; rpart's fit is deterministic anyway
  for (i in seq_len(iterations))
  {
    # Fit a stump to the current weighting of the training data.
    tree <- rpart(Response ~ ., data = dataset, control = tree.control,
                  weights = data.weights)
    # Map the stump's numeric predictions into {-1, 1}.
    predictions <- sign(unname(predict(tree, dataset, type = "vector")))
    # Weighted training error of this stump.
    epsilon <- sum(data.weights[predictions != labels])
    # Guard the degenerate cases: epsilon of exactly 0 or 1 would give
    # alpha = +/-Inf and NaN weights on the next round.
    epsilon <- min(max(epsilon, 1e-10), 1 - 1e-10)
    alpha <- 0.5 * log((1 - epsilon) / epsilon)
    # Reweight: up-weight misclassified points, then renormalize
    # (vectorized replacement for the original per-row loops).
    unnormalized <- data.weights * exp(-alpha * labels * predictions)
    data.weights <- unnormalized / sum(unnormalized)
    weak.learners[[i]] <- tree
    classifier.weights[i] <- alpha
  }
  if (verbose)
  {
    print(weak.learners)       # the rules for each iteration of the model
    print(classifier.weights)  # the associated alphas of the model
  }
  # Final hypothesis: sign of the alpha-weighted sum of stump votes.
  output <- 0
  for (d in seq_along(weak.learners))
  {
    hypothesis <- sign(unname(predict(weak.learners[[d]], validation.set,
                                      type = "vector")))
    output <- output + hypothesis * classifier.weights[d]
  }
  sign(output)
}
# Fits a Discrete AdaBoost model (depth-1 rpart stumps) on `dataset` and
# plots classification accuracy on both the training data and
# `validation.set` after each boosting round.
#
# Args:
#   dataset:        training data frame; column 1 is the +/-1 response,
#                   named "Response", remaining columns are features.
#   validation.set: data frame with the same layout.
#   iterations:     number of boosting rounds.
#
# Returns: a ggplot object — accuracy per boosting iteration for both sets.
getAccuracyPlot <- function(dataset, validation.set, iterations)
{
  labels <- dataset[, 1]
  n <- nrow(dataset)
  data.weights <- rep(1 / n, n)
  weak.learners <- vector(mode = "list", length = iterations)
  classifier.weights <- numeric(iterations)
  tree.control <- rpart.control(maxdepth = 1)
  set.seed(0)
  for (i in seq_len(iterations))
  {
    # Fit a stump to the current weighting of the training data.
    tree <- rpart(Response ~ ., data = dataset, control = tree.control,
                  weights = data.weights)
    predictions <- sign(unname(predict(tree, dataset, type = "vector")))
    # Weighted training error; clamp so a perfect stump does not give
    # alpha = +/-Inf and NaN weights on the next round.
    epsilon <- sum(data.weights[predictions != labels])
    epsilon <- min(max(epsilon, 1e-10), 1 - 1e-10)
    alpha <- 0.5 * log((1 - epsilon) / epsilon)
    # Up-weight misclassified points and renormalize (vectorized).
    unnormalized <- data.weights * exp(-alpha * labels * predictions)
    data.weights <- unnormalized / sum(unnormalized)
    weak.learners[[i]] <- tree
    classifier.weights[i] <- alpha
  }
  # Accumulate the weighted vote incrementally: O(T) predictions instead of
  # the original O(T^2) re-prediction of every prefix of learners.
  training.score <- 0
  validation.score <- 0
  training.accuracies <- numeric(iterations)
  validation.accuracies <- numeric(iterations)
  for (i in seq_len(iterations))
  {
    alpha <- classifier.weights[i]
    train.vote <- sign(unname(predict(weak.learners[[i]], dataset,
                                      type = "vector")))
    valid.vote <- sign(unname(predict(weak.learners[[i]], validation.set,
                                      type = "vector")))
    training.score <- training.score + alpha * train.vote
    validation.score <- validation.score + alpha * valid.vote
    # Accuracy = fraction of voted signs agreeing with the response column
    # (same quantity getAccuracy computes, done in place).
    training.accuracies[i] <- mean(sign(training.score) == labels)
    validation.accuracies[i] <- mean(sign(validation.score) == validation.set[, 1])
  }
  # Plot accuracies of the boosted model on both sets at every iteration.
  df <- data.frame(indices = seq_len(iterations),
                   training.accuracies = training.accuracies,
                   validation.accuracies = validation.accuracies)
  library(ggplot2)
  ggplot(df, aes(indices)) +
    geom_line(aes(y = training.accuracies, color = "Training accuracy")) +
    geom_line(aes(y = validation.accuracies, color = "Validation accuracy")) +
    ggtitle("Classification accuracy of boosted model, per iteration") +
    labs(x = "Boosting iteration", y = "Classification accuracy")
}
# Returns the accuracy rate: the fraction of entries in prediction.vector
# that match the response (first) column of dataset.
#
# Args:
#   prediction.vector: vector of predicted labels, one per row of dataset.
#   dataset:           data frame whose first column holds the true labels.
#
# Returns: numeric scalar in [0, 1].
getAccuracy <- function(prediction.vector, dataset)
{
  # Vectorized replacement for the original element-by-element count:
  # mean of a logical vector is the proportion of TRUEs.
  mean(prediction.vector == dataset[, 1])
}
# Problem 6b.
getAccuracyPlot(training.data, validation.data, 20)
# Problem 6c.
# The ideal boosted model is constructed from 9 iterations. This maximizes the model's accuracy on the validation data.
# We see evidence of overfitting due to more boosting iterations, as our classification accuracy on the validation data drops
# off after 9 iterations.
# As the iterations increase, the model better fits the training data, as indicated by the red line.
# However, this trend is not true for the validation data, due to the increased risk of overfitting on the training data, such that
# it cannot be generalized to the validation data.
# Problem 6d.
# See lines 86 and 102.
DiscreteAdaboost(training.data, validation.data, 10)
# 1st rule: Split on V14 = 0.00137654, alpha = 1.0235397, feature = COREST
# 2nd rule: Split on V29 = 0.09152545, alpha = 0.4967663, feature = H3K27AC
# 3rd rule: Split on V32 = 0.0102931, alpha = 0.3287472, feature = H3K4ME1
# 4th rule: Split on V35 = 0.0368491, alpha = 0.4540716, feature = H3K79ME2
# 5th rule: Split on V51 = 0.118309, alpha = 0.3781959, feature = NFIC
# 6th rule: Split on V60 = 0.003321625, alpha = 0.2842413, feature = PHASTCONS
# 7th rule: Split on V33 = 0.123807, alpha = 0.2300260, feature = H3K4ME2
# 8th rule: Split on V34 = 0.07069245, alpha = 0.2975442, feature = H3K4ME3
# 9th rule: Split on V14 = 0.00993149, alpha = 0.2308955, feature = CPGISLANDS
# 10th rule: Split on V18 = 0.00993149, alpha = 0.2484172, feature = EBF1
# Problem 6e.
test.predictions <- DiscreteAdaboost(training.data, testing.data, 9)
getAccuracy(test.predictions, testing.data)
# Our validated AdaBoost model has a classification accuracy of 89.84% on the test dataset.
# Problem 6f.
library(pROC)
# Named roc.curve (not `roc`) so the result does not shadow pROC::roc.
roc.curve <- roc(testing.data$Response, test.predictions)
print(roc.curve)
plot(roc.curve)
# AUC = 0.8984
# (Pastebin page footer removed — not part of the script.)