Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
```{r}
library(readxl)
library(glmnet)

# Load the data set; the "Spam" column is the response, all other columns
# are treated as predictors.
data <- read_excel("spambase.xlsx")

# No train/test split is made here: the bootstrap loops below resample
# directly from the full data set.
B <- 100        # number of bootstrap replicates in each loop
n <- nrow(data) # size of every bootstrap resample

# Number of predictor columns (everything except the response).
p <- sum(names(data) != "Spam")

# Number of predictors drawn per replicate in the first and second loop.
q1 <- round(p / 2)
q2 <- round(p / 2)

# Number of cross-validation folds passed to cv.glmnet().
nfolds <- 4
# Stage 1: estimate feature importance ----
# For each of the B bootstrap resamples, draw q1 predictors uniformly at
# random, fit a cross-validated lasso logistic regression on them, and
# accumulate the coefficients at lambda.min. A predictor's importance is the
# absolute value of its coefficient averaged over the replicates in which it
# was drawn.
feature_names <- names(data)[names(data) != "Spam"]
beta_hat <- setNames(rep(0, p), feature_names)  # running coefficient sums
counts <- beta_hat                              # times each predictor was drawn

set.seed(12345)
for (i in seq_len(B)) {
  # Bootstrap resample of the rows.
  boot <- data[sample(n, n, replace = TRUE), ]

  # Uniform random subset of q1 predictor columns.
  x <- as.matrix(boot[, feature_names])
  x <- x[, sample(seq_len(ncol(x)), q1)]
  y <- as.matrix(boot[, "Spam"])

  lasso_cv <- cv.glmnet(
    x = x, y = y, alpha = 1, nfolds = nfolds, family = "binomial"
  )

  # Coefficients at the CV-optimal lambda; row 1 is the intercept, the
  # remaining rows are named after the sampled predictors.
  coefs <- coef(lasso_cv, s = "lambda.min")
  drawn <- rownames(coefs)[-1]
  beta_hat[drawn] <- beta_hat[drawn] + as.matrix(coefs)[-1, 1]
  counts[drawn] <- counts[drawn] + 1
}

# A predictor that was never drawn has count 0; dividing by it would give
# NaN, which sample(prob = importance) in the next stage rejects. Dividing
# by at least 1 leaves such predictors with importance 0 instead.
importance <- abs(beta_hat / pmax(counts, 1))
# Stage 2: importance-weighted fit ----
# Same bootstrap/lasso procedure as before, but the q2 predictors of each
# replicate are drawn with probability proportional to `importance`. The
# final model is the per-predictor average coefficient (over the replicates
# in which the predictor was drawn) and the intercept averaged over all B
# replicates.
feature_names <- names(data)[names(data) != "Spam"]
beta_hat <- setNames(rep(0, p), feature_names)
counts <- setNames(rep(0, p), feature_names)
intercept <- 0

set.seed(12345)
for (i in seq_len(B)) {
  boot <- data[sample(n, n, replace = TRUE), ]

  x <- as.matrix(boot[, names(boot) != "Spam"])
  keep <- sample(seq_len(ncol(x)), q2, prob = importance)
  x <- x[, keep]
  y <- as.matrix(boot[, "Spam"])

  lasso_cv <- cv.glmnet(
    x = x, y = y, alpha = 1, nfolds = nfolds, family = "binomial"
  )

  # Row 1 of the coefficient matrix is the intercept; the rest are the
  # sampled predictors, identified by row name.
  coefs <- coef(lasso_cv, s = "lambda.min")
  drawn <- rownames(coefs)[-1]
  beta_hat[drawn] <- beta_hat[drawn] + as.matrix(coefs)[-1, 1]
  counts[drawn] <- counts[drawn] + 1
  intercept <- intercept + coefs[1]
}

# Predictors that were never drawn keep a coefficient of 0 (0 / 1).
counts[counts == 0] <- 1
beta_hat <- beta_hat / counts
intercept <- intercept / B
```
```{r}
#' Classify observations with a logistic-regression linear predictor
#'
#' Computes `x %*% weights + intercept` for every row, maps it through the
#' logistic function, and labels a row 1 when the resulting probability
#' exceeds 0.5 and 0 otherwise.
#'
#' @param x Matrix or data frame of numeric predictors, one row per
#'   observation. Columns must be in the same order as `weights`.
#' @param weights Numeric vector with one coefficient per column of `x`.
#' @param intercept Numeric scalar added to every linear predictor.
#' @return Numeric vector of 0/1 labels, one per row of `x` (length 0 when
#'   `x` has no rows).
predict_own <- function(x, weights, intercept) {
  # as.matrix() lets matrices, data frames and tibbles be handled alike.
  features <- as.matrix(x)
  stopifnot(ncol(features) == length(weights))

  log_odds <- as.vector(features %*% weights) + intercept
  prob <- 1 / (1 + exp(-log_odds))

  # prob is P(class = 1), so a probability above 0.5 means class 1.
  as.numeric(prob > 0.5)
}
# Evaluate the averaged model and plot the ten most important features.
# NOTE(review): predictions are made on `data`, the same rows the bootstrap
# resamples were drawn from, so the accuracy below is not a held-out estimate.
x <- data[, names(data) != "Spam"]
y <- data[["Spam"]]
predictions <- predict_own(x, beta_hat, intercept)

# head() avoids NA padding when there are fewer than ten features.
importance <- sort(importance, decreasing = TRUE)
barplot(head(importance, 10), ylab = "Importance", xlab = "Feature")

# Use the same levels for both margins so the table is always square, even
# when only one class is predicted; otherwise diag() would not pick out the
# correctly classified counts.
class_levels <- sort(union(y, predictions))
confusion_matrix <- table(
  y = factor(y, levels = class_levels),
  predictions = factor(predictions, levels = class_levels)
)
confusion_matrix
cat("Accuracy: ", sum(diag(confusion_matrix))/sum(confusion_matrix))
```
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement