Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- rm(list= ls())
- library(ggplot2)
- library(GGally)
- library(pROC)
- # Hosmer-Lemeshow goodness of fit test:
- library(ResourceSelection)
- hospital <- read.delim("Data/hospital.txt", sep = ";")
- #first part
- hospital$health_cat <- factor(hospital$health,
- levels = c(1, 2, 3),
- labels = c("good", "bad", "somewhere in between"))
- hospital$hosp_cat <- factor(hospital$hosp,
- levels = c(0, 1),
- labels = c("0 days", "1+ days"))
- hospital$health_cat <- factor(hospital$health,
- levels = c(1, 2, 3),
- labels = c("good", "bad", "somewhere in between"))
- hospital$health_cat <- relevel(hospital$health, "good")
- ################## 2(a)
- hospital$sex <- factor(hospital$sex,
- levels = c(1, 2),
- labels = c("Male", "Female"))
- table(hospital$sex)
- hospital$sex <- relevel(hospital$sex, "Female")
- hospital$civilst <- factor(hospital$civilst,
- levels = c(1, 2, 3, 4),
- labels = c("unmarried", "married", "divorsed", "widow"))
- table(hospital$civilst)
- hospital$civilst <- relevel(hospital$civilst, "married")
- hospital$exercise <- factor(hospital$exercise,
- levels = c(0, 1, 2, 3, 4),
- labels = c("no", "sometimes", "once_a_week", "twice_a_week", "strenously"))
- hospital$exercise <- relevel(hospital$exercise, "sometimes")
- table(hospital$exercise)
- hospital$work_norm <- factor(hospital$work_norm,
- levels = c(1, 2, 3, 4, 5),
- labels = c("1-19h", "20–34h", "35–97h", "self-employed", "other"))
- hospital$work_norm <- relevel(hospital$work_norm, "other")
- table(hospital$work_norm)
- hospital_nobetween <- hospital
- indx <- which(hospital_nobetween$health == 3)
- hospital_nobetween$health[indx] = 2
- hospital_nobetween$health_cat <- factor(hospital_nobetween$health,
- levels = c(1, 2),
- labels = c("good", "bad"))
- ###
- ### Models
- ###
- #### Models
- model.null <- glm(hosp_cat ~ 1 ,family="binomial", data=hospital)
- model.health <- glm(hosp_cat ~ health_cat , family = "binomial" , data=hospital)
- model.agesquared <- glm( hosp_cat ~ age+ I(age^2), family = "binomial", data= hospital)
- model.AIC <- glm(hosp_cat ~ health_cat + sex + age + I(age^2) + civilst + inc_hh,family="binomial" ,data=hospital)
- model.BIC2 <- glm(hosp_cat ~ age + health_cat + sex + I(age^2) , family = "binomial", data = hospital_nobetween)
- model.BIC3 <- glm(hosp_cat ~ age + I(age^2) + health_cat + sex, family="binomial", data=hospital)
- ###
- #### (a)
- ###
- pred.phat <- cbind(
- hospital,
- p.null = predict(model.null, type = "response"),
- p.health = predict(model.health, type = "response"),
- p.agesquared = predict(model.agesquared, type = "response"),
- p.AIC = predict(model.AIC, type = "response"),
- p.BIC2 = predict(model.BIC2, type = "response"),
- p.BIC3 = predict(model.BIC3, type = "response"))
- head(pred.phat)
- ##### Confusion matrix for model 3 and model oslo####
- ####### Calculate Y-hat using model 3 and model oslo.
- (pred.phat$yhat.AIC <- as.numeric(pred.phat$p.AIC > 0.5))
- # pred.phat$yhat.oslo <- as.numeric(pred.phat$p.oslo > 0.5)
- (row.01 <- table(hospital$hosp))
- # the confiusion matrix is a confiusion vectore just ad [1 0 0]' at the end
- (col.01.AIC <- table(pred.phat$yhat.AIC))
- (confusion.AIC <- table(pred.phat$hosp, pred.phat$yhat.AIC))
- (spec.AIC <- confusion.AIC[1, 1] /confusion.AIC[1, 1] )
- (sens.AIC <- 0 )
- (accu.AIC <- sum(diag(confusion.AIC)) / sum(confusion.AIC))
- (prec.AIC <- NaN)
- ####
- ### (b)
- ###
- pred.phat <- cbind(
- hospital,
- p.null = predict(model.null, type = "response"),
- p.health = predict(model.health, type = "response"),
- p.agesquared = predict(model.agesquared, type = "response"),
- p.AIC = predict(model.AIC, type = "response"),
- p.BIC2 = predict(model.BIC2, type = "response"),
- p.BIC3 = predict(model.BIC3, type = "response"))
- head(pred.phat)
- # ROC-curves####
- # Calculate for model 0 and 3.
- ## FOr null model
- (roc.null <- roc(hosp ~ p.null, data = pred.phat))
- # save the coordinates in a data frame for plotting.
- roc.df.null <- coords(roc.null, transpose = FALSE)
- roc.df.null$model <- "null"
- roc.df.null
- ## for Health model
- (roc.health <- roc(hosp ~ p.health, data = pred.phat))
- # save the coordinates in a data frame for plotting.
- roc.df.health <- coords(roc.health, transpose = FALSE)
- roc.df.health$model <- "health"
- head(roc.df.health)
- ## for age^2 model
- (roc.agesquared <- roc(hosp ~ p.agesquared, data = pred.phat))
- # save the coordinates in a data frame for plotting.
- roc.df.agesquared <- coords(roc.agesquared, transpose = FALSE)
- roc.df.agesquared$model <- "agesquared"
- head(roc.df.agesquared)
- ## for AIC model
- (roc.AIC <- roc(hosp ~ p.AIC, data = pred.phat))
- # save the coordinates in a data frame for plotting.
- roc.df.AIC <- coords(roc.AIC, transpose = FALSE)
- roc.df.AIC$model <- "AIC"
- head(roc.df.AIC)
- ## for BIC2 model
- (roc.BIC2 <- roc(hosp ~ p.BIC2, data = pred.phat))
- # save the coordinates in a data frame for plotting.
- roc.df.BIC2 <- coords(roc.BIC2, transpose = FALSE)
- roc.df.BIC2$model <- "BIC2"
- head(roc.df.BIC2)
- ## for BIC3 model
- (roc.BIC3 <- roc(hosp ~ p.BIC3, data = pred.phat))
- # save the coordinates in a data frame for plotting.
- roc.df.BIC3 <- coords(roc.BIC3, transpose = FALSE)
- roc.df.BIC3$model <- "BIC3"
- head(roc.df.BIC3)
- roc.df <- rbind(roc.df.health, roc.df.agesquared, roc.df.AIC, roc.df.BIC2, roc.df.BIC3 )
- # Create the data for the Ideal model by hand:
- roc.df.ideal <- data.frame(sensitivity = c(0, 1, 1),
- specificity = c(1, 1, 0),
- threshold = c(NA, NA, NA))
- roc.df.ideal$model <- "ideal"
- ggplot(roc.df, aes(specificity, sensitivity,
- color = model)) +
- geom_path(size = 1) +
- coord_fixed() + # square plotting area
- scale_x_reverse() + # Reverse scale on the x-axis!
- labs(title = "ROC-curves for all the models") +
- theme(text = element_text(size = 14))
- #Collect AUC and intervals for all the models:
- (aucs <-
- data.frame(
- model = c ("health", "agesquared", "AIC", "BIC2", "BIC3"),
- auc = c(auc(roc.health), auc(roc.agesquared), auc(roc.AIC), auc(roc.BIC2),
- auc(roc.BIC3)),
- lwr = c(ci(roc.health)[1],
- ci(roc.agesquared)[1],
- ci(roc.AIC)[1], ci(roc.BIC2)[1],
- ci(roc.BIC3)[1]),
- upr = c(ci(auc(roc.health))[3], ci(auc(roc.agesquared))[3],
- ci(auc(roc.AIC))[3], ci(auc(roc.BIC2))[3],
- ci(auc(roc.BIC3))[3])))
- # Compare the AUC for the models:
- roc.test(roc.AIC, roc.health)
- roc.test(roc.AIC, roc.agesquared)
- roc.test(roc.AIC, roc.BIC2)
- roc.test(roc.AIC, roc.BIC3)
- ####
- #### (c)
- ###
- # experiment with different values of levels for sens and spec, level.ss, to find
- # the one that gives the optimal combination of sens and spec.
- level.ss <- 0.6255
- roc.df.AIC[roc.df.AIC$sensitivity > level.ss &
- roc.df.AIC$specificity > level.ss, ]
- ##
- (I_max.AIC <- which(roc.df.AIC$sensitivity > level.ss &
- roc.df.AIC$specificity > level.ss))
- roc.df.AIC[I_max.AIC, ]
- # Pick out the corresponding threshold for p:
- roc.df.AIC[I_max.AIC, "threshold"]
- (pred.phat$yhat.AIC <- as.numeric(pred.phat$p.AIC > roc.df.AIC[I_max.AIC, "threshold"]))
- # pred.phat$yhat.oslo <- as.numeric(pred.phat$p.oslo > 0.5)
- (row.01 <- table(hospital$hosp))
- # the confiusion matrix is a confiusion vectore just ad [1 0 0]' at the end
- (col.01.AIC <- table(pred.phat$yhat.AIC))
- (confusion.AIC <- table(pred.phat$hosp, pred.phat$yhat.AIC))
- (spec.AIC <- confusion.AIC[1, 1] / (confusion.AIC[1, 1]+confusion.AIC[1, 2]) )
- (sens.AIC <- confusion.AIC[2, 2] / (confusion.AIC[2, 1]+confusion.AIC[2, 2]) )
- (accu.AIC <- sum(diag(confusion.AIC)) / sum(confusion.AIC))
- (prec.AIC <- confusion.AIC[2, 2] / (confusion.AIC[1, 2]+confusion.AIC[2, 2]) )
- # Hosmer-Lemeshow-test####
- # Illustrating example: plot in sorted p-order
- # order(variable) gives the ranks for the values in variable.
- # It can then be used to sort the data frame:
- pred.sort <- pred.phat[order(pred.phat$p.AIC), ]
- pred.sort$rank <- seq(1, nrow(pred.sort))
- head(pred.sort)
- # Divide the n=500 observations into g=10 groups:
- n <- nrow(pred.sort)
- g <- 10
- # with ng = 50 observations each:
- ng <- n/g
- # Plot p_i and Y_i
- # Add i vertical jitter to Y_i to separate them
- ggplot(pred.sort, aes(rank, p.AIC)) +
- geom_point() +
- geom_jitter(aes(y = hosp), height = 0.01) +
- geom_vline(xintercept = seq(ng, nrow(pred.sort) - ng, ng)) +
- labs(title = "Model 3: Estimated probabilities by increasing size",
- caption = "g = 10 groups",
- x = "(i) = 1,...,n", y = "p-hat") +
- theme(text = element_text(size = 14))
- # HL by hand####
- # The following can be done using the output of the
- # hoslem.test function, see below.
- # I only do it here to illustrate the steps.
- # A for-loop to set the group numbers:
- pred.sort$group <- NA
- for (k in seq(1, g)) {
- I <- (k - 1)*ng + seq(1, ng)
- pred.sort$group[I] <- k
- }
- head(pred.sort)
- # Calculate Observed and Expected in each group:
- # aggregate(y ~ x, FUN = mean) calculates the mean of y
- # separately for each group.
- # merge(data1, data2) joins two data frames using any common
- # variables as keys, in this case, "group".
- # Number of successes:
- OE1 <- merge(aggregate(hosp ~ group, data = pred.sort, FUN = sum),
- aggregate(p.AIC ~ group, data = pred.sort, FUN = sum))
- OE1
- # Number of failures = n_g - successes:
- OE0 <- OE1
- OE0$hosp <- ng - OE1$hosp
- OE0$p.AIC <- ng - OE1$p.AIC
- # A variable to use for color coding:
- OE1$outcome <- "Y = 1"
- OE0$outcome <- "Y = 0"
- # Bind the two data sets as rows (r):
- (OE <- rbind(OE1, OE0))
- # And plot:
- # Set the tickmarks on the x-axis to integers 1,...,g
- # Note the linetype inside the aes() to set different
- # linetype to O and E automatically
- ggplot(OE, aes(group, p.AIC, color = outcome)) +
- geom_line(aes(linetype = "expected"), size = 1) +
- geom_line(aes(y = hosp, linetype = "observed"), size = 1) +
- labs(title = "Model AIC: Observed and expected in each group",
- y = "number of observations") +
- theme(text = element_text(size = 14)) +
- scale_x_continuous(breaks = seq(1, g))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement