# (Pastebin page header removed — not part of the script.)
# Kevin Song
# CS 373
# Pset 3 Question 6
# Collaborators: Dionna Jacobson, Ravina Jain

library(rpart)

# Read a features file and a labels file and bind them into one data frame
# whose first column is the +/-1 response, renamed to "Response".
# (Replaces the original copy-pasted load logic; also drops the attach()
# calls — they were redundant since every model call passes `data =`
# explicitly, and one of them attached the wrong object, testing.labels.)
load.dataset <- function(features.file, labels.file) {
  features <- read.table(features.file, header = FALSE)
  labels <- read.table(labels.file, header = FALSE)
  dataset <- cbind(labels, features)
  colnames(dataset)[1] <- "Response"
  dataset
}

# Import training, validation, and testing data.
# NOTE(review): "validationLabels.txt" has a lower-case v while every other
# file name is capitalized — confirm the actual file name on disk.
training.data <- load.dataset("TrainingFeatures.txt", "TrainingLabels.txt")
validation.data <- load.dataset("ValidationFeatures.txt", "validationLabels.txt")
testing.data <- load.dataset("TestingFeatures.txt", "TestingLabels.txt")

# Sources: http://www.cs.utsa.edu/~bylander/pubs/FLAIRS06-108.pdf
# http://ttic.uchicago.edu/~tewari/lectures/lecture6.pdf
# and Lecture 16 notes
# Discrete AdaBoost with depth-1 rpart stumps as weak learners.
#
# Args:
#   dataset:        training data frame; column 1 is the +/-1 response,
#                   named "Response", remaining columns are features.
#   validation.set: data frame with the same layout, to predict on.
#   iterations:     number of boosting rounds.
#   verbose:        print the fitted stumps and their alphas (default TRUE,
#                   matching the original behavior).
#
# Returns: a vector of +/-1 predictions for validation.set, the sign of the
#          alpha-weighted vote of the weak learners.
DiscreteAdaboost <- function(dataset, validation.set, iterations, verbose = TRUE)
{
  labels <- dataset[, 1]
  n <- nrow(dataset)
  # Initialize uniform training-data weights.
  data.weights <- rep(1 / n, n)
  # Preallocate instead of growing inside the loop.
  weak.learners <- vector(mode = "list", length = iterations)
  classifier.weights <- numeric(iterations)
  tree.control <- rpart.control(maxdepth = 1)
  set.seed(0)  # hoisted out of the loop; rpart's fit is deterministic anyway
  for (i in seq_len(iterations))
  {
    # Fit a stump to the current weighting of the training data.
    tree <- rpart(Response ~ ., data = dataset, control = tree.control,
                  weights = data.weights)
    # Map the stump's numeric predictions into {-1, 1}.
    predictions <- sign(unname(predict(tree, dataset, type = "vector")))
    # Weighted training error of this stump.
    epsilon <- sum(data.weights[predictions != labels])
    # Guard the degenerate cases: epsilon of exactly 0 or 1 would give
    # alpha = +/-Inf and NaN weights on the next round.
    epsilon <- min(max(epsilon, 1e-10), 1 - 1e-10)
    alpha <- 0.5 * log((1 - epsilon) / epsilon)
    # Reweight: up-weight misclassified points, then renormalize
    # (vectorized replacement for the original per-row loops).
    unnormalized <- data.weights * exp(-alpha * labels * predictions)
    data.weights <- unnormalized / sum(unnormalized)
    weak.learners[[i]] <- tree
    classifier.weights[i] <- alpha
  }
  if (verbose)
  {
    print(weak.learners)       # the rules for each iteration of the model
    print(classifier.weights)  # the associated alphas of the model
  }
  # Final hypothesis: sign of the alpha-weighted sum of stump votes.
  output <- 0
  for (d in seq_along(weak.learners))
  {
    hypothesis <- sign(unname(predict(weak.learners[[d]], validation.set,
                                      type = "vector")))
    output <- output + hypothesis * classifier.weights[d]
  }
  sign(output)
}
# Fits a Discrete AdaBoost model (depth-1 rpart stumps) on `dataset` and
# plots classification accuracy on both the training data and
# `validation.set` after each boosting round.
#
# Args:
#   dataset:        training data frame; column 1 is the +/-1 response,
#                   named "Response", remaining columns are features.
#   validation.set: data frame with the same layout.
#   iterations:     number of boosting rounds.
#
# Returns: a ggplot object — accuracy per boosting iteration for both sets.
getAccuracyPlot <- function(dataset, validation.set, iterations)
{
  labels <- dataset[, 1]
  n <- nrow(dataset)
  data.weights <- rep(1 / n, n)
  weak.learners <- vector(mode = "list", length = iterations)
  classifier.weights <- numeric(iterations)
  tree.control <- rpart.control(maxdepth = 1)
  set.seed(0)
  for (i in seq_len(iterations))
  {
    # Fit a stump to the current weighting of the training data.
    tree <- rpart(Response ~ ., data = dataset, control = tree.control,
                  weights = data.weights)
    predictions <- sign(unname(predict(tree, dataset, type = "vector")))
    # Weighted training error; clamp so a perfect stump does not give
    # alpha = +/-Inf and NaN weights on the next round.
    epsilon <- sum(data.weights[predictions != labels])
    epsilon <- min(max(epsilon, 1e-10), 1 - 1e-10)
    alpha <- 0.5 * log((1 - epsilon) / epsilon)
    # Up-weight misclassified points and renormalize (vectorized).
    unnormalized <- data.weights * exp(-alpha * labels * predictions)
    data.weights <- unnormalized / sum(unnormalized)
    weak.learners[[i]] <- tree
    classifier.weights[i] <- alpha
  }
  # Accumulate the weighted vote incrementally: O(T) predictions instead of
  # the original O(T^2) re-prediction of every prefix of learners.
  training.score <- 0
  validation.score <- 0
  training.accuracies <- numeric(iterations)
  validation.accuracies <- numeric(iterations)
  for (i in seq_len(iterations))
  {
    alpha <- classifier.weights[i]
    train.vote <- sign(unname(predict(weak.learners[[i]], dataset,
                                      type = "vector")))
    valid.vote <- sign(unname(predict(weak.learners[[i]], validation.set,
                                      type = "vector")))
    training.score <- training.score + alpha * train.vote
    validation.score <- validation.score + alpha * valid.vote
    # Accuracy = fraction of voted signs agreeing with the response column
    # (same quantity getAccuracy computes, done in place).
    training.accuracies[i] <- mean(sign(training.score) == labels)
    validation.accuracies[i] <- mean(sign(validation.score) == validation.set[, 1])
  }
  # Plot accuracies of the boosted model on both sets at every iteration.
  df <- data.frame(indices = seq_len(iterations),
                   training.accuracies = training.accuracies,
                   validation.accuracies = validation.accuracies)
  library(ggplot2)
  ggplot(df, aes(indices)) +
    geom_line(aes(y = training.accuracies, color = "Training accuracy")) +
    geom_line(aes(y = validation.accuracies, color = "Validation accuracy")) +
    ggtitle("Classification accuracy of boosted model, per iteration") +
    labs(x = "Boosting iteration", y = "Classification accuracy")
}
# Returns the accuracy rate: the fraction of entries in prediction.vector
# that match the response (first) column of dataset.
#
# Args:
#   prediction.vector: vector of predicted labels, one per row of dataset.
#   dataset:           data frame whose first column holds the true labels.
#
# Returns: numeric scalar in [0, 1].
getAccuracy <- function(prediction.vector, dataset)
{
  # Vectorized replacement for the original element-by-element count:
  # mean of a logical vector is the proportion of TRUEs.
  mean(prediction.vector == dataset[, 1])
}
# Problem 6b.
getAccuracyPlot(training.data, validation.data, 20)
# Problem 6c.
# The ideal boosted model is constructed from 9 iterations. This maximizes the model's accuracy on the validation data.
# We see evidence of overfitting due to more boosting iterations, as our classification accuracy on the validation data drops
# off after 9 iterations.
# As the iterations increase, the model better fits the training data, as indicated by the red line.
# However, this trend is not true for the validation data, due to the increased risk of overfitting on the training data, such that
# it cannot be generalized to the validation data.
# Problem 6d.
# See lines 86 and 102.
DiscreteAdaboost(training.data, validation.data, 10)
# 1st rule: Split on V14 = 0.00137654, alpha = 1.0235397, feature = COREST
# 2nd rule: Split on V29 = 0.09152545, alpha = 0.4967663, feature = H3K27AC
# 3rd rule: Split on V32 = 0.0102931, alpha = 0.3287472, feature = H3K4ME1
# 4th rule: Split on V35 = 0.0368491, alpha = 0.4540716, feature = H3K79ME2
# 5th rule: Split on V51 = 0.118309, alpha = 0.3781959, feature = NFIC
# 6th rule: Split on V60 = 0.003321625, alpha = 0.2842413, feature = PHASTCONS
# 7th rule: Split on V33 = 0.123807, alpha = 0.2300260, feature = H3K4ME2
# 8th rule: Split on V34 = 0.07069245, alpha = 0.2975442, feature = H3K4ME3
# 9th rule: Split on V14 = 0.00993149, alpha = 0.2308955, feature = CPGISLANDS
# 10th rule: Split on V18 = 0.00993149, alpha = 0.2484172, feature = EBF1
# Problem 6e.
test.predictions <- DiscreteAdaboost(training.data, testing.data, 9)
getAccuracy(test.predictions, testing.data)
# Our validated AdaBoost model has a classification accuracy of 89.84% on the test dataset.
# Problem 6f.
library(pROC)
# Named roc.curve (not `roc`) so the result does not shadow pROC::roc.
roc.curve <- roc(testing.data$Response, test.predictions)
print(roc.curve)
plot(roc.curve)
# AUC = 0.8984
# (Pastebin page footer removed — not part of the script.)