Untitled

rm(list= ls())
library(ggplot2)
library(GGally)
library(pROC)
# Hosmer-Lemeshow goodness of fit test:
library(ResourceSelection)

hospital <- read.delim("Data/hospital.txt", sep = ";")
#first part

hospital$health_cat <- factor(hospital$health,
                              levels = c(1, 2, 3),
                              labels = c("good", "bad", "somewhere in between"))


hospital$hosp_cat <- factor(hospital$hosp,
                            levels = c(0, 1),
                            labels = c("0 days", "1+ days"))

hospital$health_cat <- factor(hospital$health,
                              levels = c(1, 2, 3),
                              labels = c("good", "bad", "somewhere in between"))

hospital$health_cat <- relevel(hospital$health, "good")


################## 2(a)

hospital$sex <- factor(hospital$sex,
                       levels = c(1, 2),
                       labels = c("Male", "Female"))

table(hospital$sex)
hospital$sex <- relevel(hospital$sex, "Female")

hospital$civilst <- factor(hospital$civilst,
                           levels = c(1, 2, 3, 4),
                           labels = c("unmarried", "married", "divorsed", "widow"))
table(hospital$civilst)
hospital$civilst <- relevel(hospital$civilst, "married")


hospital$exercise <- factor(hospital$exercise,
                            levels = c(0, 1, 2, 3, 4),
                            labels = c("no", "sometimes", "once_a_week", "twice_a_week", "strenously"))
hospital$exercise <- relevel(hospital$exercise, "sometimes")
table(hospital$exercise)

hospital$work_norm <- factor(hospital$work_norm,
                             levels = c(1, 2, 3, 4, 5),
                             labels = c("1-19h", "20–34h", "35–97h", "self-employed", "other"))
hospital$work_norm <- relevel(hospital$work_norm, "other")
table(hospital$work_norm)


hospital_nobetween <- hospital
indx <- which(hospital_nobetween$health == 3)
hospital_nobetween$health[indx] = 2

hospital_nobetween$health_cat <- factor(hospital_nobetween$health,
                                        levels = c(1, 2),
                                        labels = c("good", "bad"))


###
### Models
###


#### Models
model.null <- glm(hosp_cat ~ 1 ,family="binomial", data=hospital)
model.health <- glm(hosp_cat ~ health_cat , family = "binomial" , data=hospital)
model.agesquared <- glm( hosp_cat ~ age+ I(age^2), family = "binomial", data= hospital)
model.AIC   <-  glm(hosp_cat ~ health_cat + sex + age + I(age^2) + civilst + inc_hh,family="binomial" ,data=hospital)
model.BIC2  <-  glm(hosp_cat ~ age + health_cat + sex + I(age^2) , family = "binomial", data = hospital_nobetween)
model.BIC3    <- glm(hosp_cat ~ age + I(age^2) + health_cat + sex, family="binomial", data=hospital)


###
#### (a)
###


pred.phat <- cbind(
  hospital,
  p.null = predict(model.null, type = "response"),
  p.health = predict(model.health, type = "response"),
  p.agesquared = predict(model.agesquared, type = "response"),
  p.AIC = predict(model.AIC, type = "response"),
  p.BIC2 = predict(model.BIC2, type = "response"),
  p.BIC3 = predict(model.BIC3, type = "response"))

head(pred.phat)


##### Confusion matrix for model 3 and model oslo####
####### Calculate Y-hat using model 3 and model oslo.

(pred.phat$yhat.AIC <- as.numeric(pred.phat$p.AIC > 0.5))
# pred.phat$yhat.oslo <- as.numeric(pred.phat$p.oslo > 0.5)

(row.01 <- table(hospital$hosp))

# the confiusion matrix is a confiusion vectore just ad [1 0 0]' at the end
(col.01.AIC <- table(pred.phat$yhat.AIC))
(confusion.AIC <- table(pred.phat$hosp, pred.phat$yhat.AIC))
(spec.AIC <- confusion.AIC[1, 1] /confusion.AIC[1, 1] )
(sens.AIC <- 0 )
(accu.AIC <- sum(diag(confusion.AIC)) / sum(confusion.AIC))
(prec.AIC <- NaN)


####
###  (b)
###


pred.phat <- cbind(
  hospital,
  p.null = predict(model.null, type = "response"),
  p.health = predict(model.health, type = "response"),
  p.agesquared = predict(model.agesquared, type = "response"),
  p.AIC = predict(model.AIC, type = "response"),
  p.BIC2 = predict(model.BIC2, type = "response"),
  p.BIC3 = predict(model.BIC3, type = "response"))

head(pred.phat)


# ROC-curves####
# Calculate for model 0 and 3.

## FOr null model
(roc.null <- roc(hosp ~ p.null, data = pred.phat))
# save the coordinates in a data frame for plotting.
roc.df.null <- coords(roc.null, transpose = FALSE)
roc.df.null$model <- "null"
roc.df.null


## for Health model
(roc.health <- roc(hosp ~ p.health, data = pred.phat))
# save the coordinates in a data frame for plotting.
roc.df.health <- coords(roc.health, transpose = FALSE)
roc.df.health$model <- "health"
head(roc.df.health)

## for age^2 model
(roc.agesquared <- roc(hosp ~ p.agesquared, data = pred.phat))
# save the coordinates in a data frame for plotting.
roc.df.agesquared <- coords(roc.agesquared, transpose = FALSE)
roc.df.agesquared$model <- "agesquared"
head(roc.df.agesquared)


## for AIC model
(roc.AIC <- roc(hosp ~ p.AIC, data = pred.phat))
# save the coordinates in a data frame for plotting.
roc.df.AIC <- coords(roc.AIC, transpose = FALSE)
roc.df.AIC$model <- "AIC"
head(roc.df.AIC)


## for BIC2 model
(roc.BIC2 <- roc(hosp ~ p.BIC2, data = pred.phat))
# save the coordinates in a data frame for plotting.
roc.df.BIC2 <- coords(roc.BIC2, transpose = FALSE)
roc.df.BIC2$model <- "BIC2"
head(roc.df.BIC2)


## for BIC3 model
(roc.BIC3 <- roc(hosp ~ p.BIC3, data = pred.phat))
# save the coordinates in a data frame for plotting.
roc.df.BIC3 <- coords(roc.BIC3, transpose = FALSE)
roc.df.BIC3$model <- "BIC3"
head(roc.df.BIC3)


roc.df <- rbind(roc.df.health, roc.df.agesquared, roc.df.AIC, roc.df.BIC2, roc.df.BIC3 )


# Create the data for the Ideal model by hand:
roc.df.ideal <- data.frame(sensitivity = c(0, 1, 1),
                           specificity = c(1, 1, 0),
                           threshold = c(NA, NA, NA))
roc.df.ideal$model <- "ideal"


ggplot(roc.df, aes(specificity, sensitivity,
                   color = model)) +
  geom_path(size = 1) +
  coord_fixed() +       # square plotting area
  scale_x_reverse() +   # Reverse scale on the x-axis!
  labs(title = "ROC-curves for all the models") +
  theme(text = element_text(size = 14))


#Collect AUC and intervals for all the models:
(aucs <-
    data.frame(
      model = c ("health", "agesquared", "AIC", "BIC2", "BIC3"),
      auc = c(auc(roc.health), auc(roc.agesquared), auc(roc.AIC), auc(roc.BIC2),
              auc(roc.BIC3)),
      lwr = c(ci(roc.health)[1],
              ci(roc.agesquared)[1],
              ci(roc.AIC)[1], ci(roc.BIC2)[1],
              ci(roc.BIC3)[1]),
      upr = c(ci(auc(roc.health))[3], ci(auc(roc.agesquared))[3],
              ci(auc(roc.AIC))[3], ci(auc(roc.BIC2))[3],
              ci(auc(roc.BIC3))[3])))


# Compare the AUC for the models:
roc.test(roc.AIC, roc.health)
roc.test(roc.AIC, roc.agesquared)
roc.test(roc.AIC, roc.BIC2)
roc.test(roc.AIC, roc.BIC3)


####
####   (c)
###

# experiment with different values of levels for sens and spec, level.ss, to find
# the one that gives the optimal combination of sens and spec.
level.ss <- 0.6255
roc.df.AIC[roc.df.AIC$sensitivity > level.ss &
             roc.df.AIC$specificity > level.ss, ]
##
(I_max.AIC <- which(roc.df.AIC$sensitivity > level.ss &
                      roc.df.AIC$specificity > level.ss))
roc.df.AIC[I_max.AIC, ]
# Pick out the corresponding threshold for p:
roc.df.AIC[I_max.AIC, "threshold"]


(pred.phat$yhat.AIC <- as.numeric(pred.phat$p.AIC > roc.df.AIC[I_max.AIC, "threshold"]))
# pred.phat$yhat.oslo <- as.numeric(pred.phat$p.oslo > 0.5)

(row.01 <- table(hospital$hosp))

# the confiusion matrix is a confiusion vectore just ad [1 0 0]' at the end
(col.01.AIC <- table(pred.phat$yhat.AIC))
(confusion.AIC <- table(pred.phat$hosp, pred.phat$yhat.AIC))
(spec.AIC <- confusion.AIC[1, 1] / (confusion.AIC[1, 1]+confusion.AIC[1, 2]) )
(sens.AIC <- confusion.AIC[2, 2] / (confusion.AIC[2, 1]+confusion.AIC[2, 2]) )
(accu.AIC <- sum(diag(confusion.AIC)) / sum(confusion.AIC))
(prec.AIC <- confusion.AIC[2, 2] / (confusion.AIC[1, 2]+confusion.AIC[2, 2])  )


# Hosmer-Lemeshow-test####
# Illustrating example: plot in sorted p-order
# order(variable) gives the ranks for the values in variable.
# It can then be used to sort the data frame:
pred.sort <- pred.phat[order(pred.phat$p.AIC), ]
pred.sort$rank <- seq(1, nrow(pred.sort))
head(pred.sort)


  # Divide the n=500 observations into g=10 groups:
  n <- nrow(pred.sort)
  g <- 10
  # with ng = 50 observations each:
  ng <- n/g

  # Plot p_i and Y_i
  # Add i vertical jitter to Y_i to separate them
  ggplot(pred.sort, aes(rank, p.AIC)) +
    geom_point() +
    geom_jitter(aes(y = hosp), height = 0.01) +
    geom_vline(xintercept = seq(ng, nrow(pred.sort) - ng, ng)) +
    labs(title = "Model 3: Estimated probabilities by increasing size",
         caption = "g = 10 groups",
         x = "(i) = 1,...,n", y = "p-hat") +
    theme(text = element_text(size = 14))


  # HL by hand####
  # The following can be done using the output of the
  # hoslem.test function, see below.
  # I only do it here to illustrate the steps.

  # A for-loop to set the group numbers:
  pred.sort$group <- NA
  for (k in seq(1, g)) {
    I <- (k - 1)*ng + seq(1, ng)
    pred.sort$group[I] <- k
  }
  head(pred.sort)

  # Calculate Observed and Expected in each group:
  # aggregate(y ~ x, FUN = mean) calculates the mean of y
  # separately for each group.
  # merge(data1, data2) joins two data frames using any common
  # variables as keys, in this case, "group".

  # Number of successes:
  OE1 <- merge(aggregate(hosp ~ group, data = pred.sort, FUN = sum),
               aggregate(p.AIC ~ group, data = pred.sort, FUN = sum))
  OE1
  # Number of failures = n_g - successes:
  OE0 <- OE1
  OE0$hosp <- ng - OE1$hosp
  OE0$p.AIC <- ng - OE1$p.AIC
  # A variable to use for color coding:
  OE1$outcome <- "Y = 1"
  OE0$outcome <- "Y = 0"
  # Bind the two data sets as rows (r):
  (OE <- rbind(OE1, OE0))


  # And plot:
  # Set the tickmarks on the x-axis to integers 1,...,g
  # Note the linetype inside the aes() to set different
  # linetype to O and E automatically

  ggplot(OE, aes(group, p.AIC, color = outcome)) +
    geom_line(aes(linetype = "expected"), size = 1) +
    geom_line(aes(y = hosp, linetype = "observed"), size = 1) +
    labs(title = "Model AIC: Observed and expected in each group",
         y = "number of observations") +
    theme(text = element_text(size = 14)) +
    scale_x_continuous(breaks = seq(1, g))