Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ---
- title: "Inclass 11-08"
- author: "Nino & Theo"
- date: "8 novembre 2018"
- output: html_document
- ---
- ```{r setup, include=FALSE}
- knitr::opts_chunk$set(echo = TRUE)  # default for every chunk: show the R source in the knitted output
- ```
- ```{r}
- # Load the survey data and fit a logistic regression of `alc` on every other
- # column. read.table() already returns a data.frame, and TRUE is spelled out
- # because `T` is an ordinary variable that can be reassigned.
- spor <- read.table("spor.csv", header = TRUE, sep = ",")
- # Labels for the parental-education codes, in ascending order of the codes.
- # NOTE(review): this relies on each column containing exactly five distinct
- # values (presumably 0-4); factor() errors if the counts differ -- confirm.
- edu_labels <- c(
-   "none", "4th grade", "5th to 9th grade",
-   "secondary education", "higher education"
- )
- edu_cols <- c("Medu", "Fedu")
- spor[edu_cols] <- lapply(spor[edu_cols], factor, labels = edu_labels)
- # Same recoding for study time; requires exactly four distinct values.
- spor$studytime <- factor(
-   spor$studytime,
-   labels = c("<2", "2 to 5", "5 to 10", ">10")
- )
- # Despite its name, `linmod` is a binomial GLM (logistic regression).
- linmod <- glm(alc ~ ., data = spor, family = "binomial")
- summary(linmod)
- ```
- ```{r}
- library(kernlab)
- library(caret)
- library(caretEnsemble)
- library(forecast)
- # Re-read the raw file; this discards the factor recoding applied to `spor`
- # in the previous chunk. TRUE is spelled out because `T` can be reassigned.
- spor <- read.table("spor.csv", header = TRUE, sep = ",")
- # Replace `age` by a 0/1 indicator for age >= 19. Rows with a missing age
- # get 0, matching the original fill-with-0-then-overwrite assignment.
- spor$age2 <- as.numeric(!is.na(spor$age) & spor$age >= 19)
- spor$age <- NULL
- data <- spor
- # Fixed seed so the 80/20 split and the resampling folds are reproducible.
- set.seed(11)
- # NOTE(review): `alc` is passed as a numeric vector here, so caret bins it by
- # quantiles rather than by class; wrapping it in factor() would presumably
- # give a class-stratified split, but would also change the split -- confirm.
- intrain <- createDataPartition(y = data$alc, p = 0.8, list = FALSE)
- data_train <- data[intrain, ]
- data_test <- data[-intrain, ]
- # 10-fold cross-validation, repeated 5 times; reused by every model below.
- trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
- # Earlier attempt, kept for reference: linear SVM on all predictors.
- # alc_fit <- train(as.factor(alc) ~., data = data_train, method = "svmLinear", trControl=trctrl, preProcess = c("center", "scale"), tuneLength=20)
- # alc_fit
- # Linear SVM on a hand-picked set of predictors and interactions.
- # NOTE(review): with caret's default grid search, svmLinear appears to hold
- # C fixed at 1, so tuneLength probably has no effect here -- confirm.
- alc_fit2 <- train(
-   as.factor(alc) ~ sex + famsize * famrel + Pstatus * famrel +
-     studytime * absences + health + absences * goout,
-   data = data_train,
-   method = "svmLinear",
-   trControl = trctrl,
-   preProcess = c("center", "scale"),
-   tuneLength = 20
- )
- alc_fit2
- ```
- ```{r}
- # Predict the held-out rows with the SVM. predict() on a caret classification
- # model returns class labels, so no 0.5 thresholding is needed.
- data_test$svm1 <- predict(alc_fit2, newdata = data_test)
- # confusionMatrix() expects the predictions as `data` and the truth as
- # `reference`. They were passed the other way round, which mislabels
- # sensitivity, specificity and the predictive values; accuracy and kappa are
- # symmetric in the two arguments and are unaffected.
- confusionMatrix(
-   data = as.factor(data_test$svm1),
-   reference = as.factor(data_test$alc)
- )
- ```
- The baseline accuracy and kappa are 65.59% and 25.88%. We will try to improve on this by ensembling several models.
- ```{r}
- library(xgboost)
- # Boosted linear model on all predictors, evaluated on the held-out rows.
- # tuneLength = 1 asks caret for a single candidate per tuning parameter.
- fit2 <- train(
-   as.factor(alc) ~ .,
-   data = data_train,
-   metric = "Accuracy",
-   method = "xgbLinear",
-   trControl = trctrl,
-   preProcess = c("center", "scale"),
-   tuneLength = 1
- )
- data_test$fit2 <- predict(fit2, newdata = data_test)
- # Predictions go in `data`, the truth in `reference`; the swapped order used
- # before mislabels sensitivity/specificity (accuracy and kappa are unchanged).
- confusionMatrix(
-   data = as.factor(data_test$fit2),
-   reference = as.factor(data_test$alc)
- )
- ```
- ```{r}
- # Boosted trees on all predictors, evaluated on the held-out rows.
- # tuneLength = 1 asks caret for a single candidate per tuning parameter.
- fit3 <- train(
-   as.factor(alc) ~ .,
-   data = data_train,
-   metric = "Accuracy",
-   method = "xgbTree",
-   trControl = trctrl,
-   preProcess = c("center", "scale"),
-   tuneLength = 1
- )
- data_test$fit3 <- predict(fit3, newdata = data_test)
- # Predictions go in `data`, the truth in `reference`; the swapped order used
- # before mislabels sensitivity/specificity (accuracy and kappa are unchanged).
- confusionMatrix(
-   data = as.factor(data_test$fit3),
-   reference = as.factor(data_test$alc)
- )
- ```
- ```{r}
- # Boosted linear model with hand-set hyper-parameters instead of a search.
- # An explicit tuneGrid takes precedence over tuneLength, so the redundant
- # tuneLength argument has been dropped.
- fit4 <- train(
-   as.factor(alc) ~ .,
-   data = data_train,
-   metric = "Accuracy",
-   method = "xgbLinear",
-   trControl = trctrl,
-   preProcess = c("center", "scale"),
-   tuneGrid = data.frame(nrounds = 45, lambda = 0, alpha = 0, eta = 0.5)
- )
- data_test$fit4 <- predict(fit4, newdata = data_test)
- # Predictions go in `data`, the truth in `reference`; the swapped order used
- # before mislabels sensitivity/specificity (accuracy and kappa are unchanged).
- confusionMatrix(
-   data = as.factor(data_test$fit4),
-   reference = as.factor(data_test$alc)
- )
- ```
- ```{r}
- # Weighted vote over the four classifiers: svm1, fit2 and fit3 count once and
- # fit4 counts twice, so the maximum score is 5 and a score of at least 3 is a
- # weighted majority for class 1.
- # Predictions are factors, so convert via character to get the label values
- # (presumably "0"/"1") rather than the internal level codes.
- as_vote <- function(pred) as.numeric(as.character(pred))
- data_test$ensemble <- as_vote(data_test$svm1) + as_vote(data_test$fit2) +
-   as_vote(data_test$fit3) + 2 * as_vote(data_test$fit4)
- # Fix the levels to 0/1 so the factor keeps both classes even when every row
- # receives the same ensemble prediction.
- data_test$ensemble_predicted <- factor(
-   ifelse(data_test$ensemble >= 3, 1, 0),
-   levels = c(0, 1)
- )
- # Predictions go in `data`, the truth in `reference`; the swapped order used
- # before mislabels sensitivity/specificity (accuracy and kappa are unchanged).
- confusionMatrix(
-   data = data_test$ensemble_predicted,
-   reference = as.factor(data_test$alc)
- )
- ```
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement