Advertisement
gl0Ppy

Untitled

May 9th, 2022
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 10.64 KB | None | 0 0
  1. rm(list= ls())
  2. library(ggplot2)
  3. library(GGally)
  4. library(pROC)
  5. # Hosmer-Lemeshow goodness of fit test:
  6. library(ResourceSelection)
  7.  
  8. hospital <- read.delim("Data/hospital.txt", sep = ";")
  9. #first part
  10.  
  11. hospital$health_cat <- factor(hospital$health,
  12.                               levels = c(1, 2, 3),
  13.                               labels = c("good", "bad", "somewhere in between"))
  14.  
  15.  
  16. hospital$hosp_cat <- factor(hospital$hosp,
  17.                             levels = c(0, 1),
  18.                             labels = c("0 days", "1+ days"))
  19.  
  20. hospital$health_cat <- factor(hospital$health,
  21.                               levels = c(1, 2, 3),
  22.                               labels = c("good", "bad", "somewhere in between"))
  23.  
  24. hospital$health_cat <- relevel(hospital$health, "good")
  25.  
  26.  
  27. ################## 2(a)
  28.  
  29. hospital$sex <- factor(hospital$sex,
  30.                        levels = c(1, 2),
  31.                        labels = c("Male", "Female"))
  32.  
  33. table(hospital$sex)
  34. hospital$sex <- relevel(hospital$sex, "Female")
  35.  
  36. hospital$civilst <- factor(hospital$civilst,
  37.                            levels = c(1, 2, 3, 4),
  38.                            labels = c("unmarried", "married", "divorsed", "widow"))
  39. table(hospital$civilst)
  40. hospital$civilst <- relevel(hospital$civilst, "married")
  41.  
  42.  
  43. hospital$exercise <- factor(hospital$exercise,
  44.                             levels = c(0, 1, 2, 3, 4),
  45.                             labels = c("no", "sometimes", "once_a_week", "twice_a_week", "strenously"))
  46. hospital$exercise <- relevel(hospital$exercise, "sometimes")
  47. table(hospital$exercise)
  48.  
  49. hospital$work_norm <- factor(hospital$work_norm,
  50.                              levels = c(1, 2, 3, 4, 5),
  51.                              labels = c("1-19h", "20–34h", "35–97h", "self-employed", "other"))
  52. hospital$work_norm <- relevel(hospital$work_norm, "other")
  53. table(hospital$work_norm)
  54.  
  55.  
  56. hospital_nobetween <- hospital
  57. indx <- which(hospital_nobetween$health == 3)
  58. hospital_nobetween$health[indx] = 2
  59.  
  60. hospital_nobetween$health_cat <- factor(hospital_nobetween$health,
  61.                                         levels = c(1, 2),
  62.                                         labels = c("good", "bad"))
  63.  
  64.  
  65. ###
  66. ### Models
  67. ###
  68.  
  69.  
  70. #### Models
  71. model.null <- glm(hosp_cat ~ 1 ,family="binomial", data=hospital)
  72. model.health <- glm(hosp_cat ~ health_cat , family = "binomial" , data=hospital)
  73. model.agesquared <- glm( hosp_cat ~ age+ I(age^2), family = "binomial", data= hospital)
  74. model.AIC   <-  glm(hosp_cat ~ health_cat + sex + age + I(age^2) + civilst + inc_hh,family="binomial" ,data=hospital)
  75. model.BIC2  <-  glm(hosp_cat ~ age + health_cat + sex + I(age^2) , family = "binomial", data = hospital_nobetween)
  76. model.BIC3    <- glm(hosp_cat ~ age + I(age^2) + health_cat + sex, family="binomial", data=hospital)
  77.  
  78.  
  79.  
  80.  
  81.  
  82. ###
  83. #### (a)
  84. ###
  85.  
  86.  
  87.  
  88. pred.phat <- cbind(
  89.   hospital,
  90.   p.null = predict(model.null, type = "response"),
  91.   p.health = predict(model.health, type = "response"),
  92.   p.agesquared = predict(model.agesquared, type = "response"),
  93.   p.AIC = predict(model.AIC, type = "response"),
  94.   p.BIC2 = predict(model.BIC2, type = "response"),
  95.   p.BIC3 = predict(model.BIC3, type = "response"))
  96.  
  97. head(pred.phat)
  98.  
  99.  
  100.  
  101.  
  102. ##### Confusion matrix for model 3 and model oslo####
  103. ####### Calculate Y-hat using model 3 and model oslo.
  104.  
  105. (pred.phat$yhat.AIC <- as.numeric(pred.phat$p.AIC > 0.5))
  106. # pred.phat$yhat.oslo <- as.numeric(pred.phat$p.oslo > 0.5)
  107.  
  108. (row.01 <- table(hospital$hosp))
  109.  
  110. # the confiusion matrix is a confiusion vectore just ad [1 0 0]' at the end
  111. (col.01.AIC <- table(pred.phat$yhat.AIC))
  112. (confusion.AIC <- table(pred.phat$hosp, pred.phat$yhat.AIC))
  113. (spec.AIC <- confusion.AIC[1, 1] /confusion.AIC[1, 1] )
  114. (sens.AIC <- 0 )
  115. (accu.AIC <- sum(diag(confusion.AIC)) / sum(confusion.AIC))
  116. (prec.AIC <- NaN)
  117.  
  118.  
  119.  
  120. ####
  121. ###  (b)
  122. ###
  123.  
  124.  
  125.  
  126.  
  127. pred.phat <- cbind(
  128.   hospital,
  129.   p.null = predict(model.null, type = "response"),
  130.   p.health = predict(model.health, type = "response"),
  131.   p.agesquared = predict(model.agesquared, type = "response"),
  132.   p.AIC = predict(model.AIC, type = "response"),
  133.   p.BIC2 = predict(model.BIC2, type = "response"),
  134.   p.BIC3 = predict(model.BIC3, type = "response"))
  135.  
  136. head(pred.phat)
  137.  
  138.  
  139.  
  140. # ROC-curves####
  141. # Calculate for model 0 and 3.
  142.  
  143. ## FOr null model
  144. (roc.null <- roc(hosp ~ p.null, data = pred.phat))
  145. # save the coordinates in a data frame for plotting.
  146. roc.df.null <- coords(roc.null, transpose = FALSE)
  147. roc.df.null$model <- "null"
  148. roc.df.null
  149.  
  150.  
  151. ## for Health model
  152. (roc.health <- roc(hosp ~ p.health, data = pred.phat))
  153. # save the coordinates in a data frame for plotting.
  154. roc.df.health <- coords(roc.health, transpose = FALSE)
  155. roc.df.health$model <- "health"
  156. head(roc.df.health)
  157.  
  158. ## for age^2 model
  159. (roc.agesquared <- roc(hosp ~ p.agesquared, data = pred.phat))
  160. # save the coordinates in a data frame for plotting.
  161. roc.df.agesquared <- coords(roc.agesquared, transpose = FALSE)
  162. roc.df.agesquared$model <- "agesquared"
  163. head(roc.df.agesquared)
  164.  
  165.  
  166. ## for AIC model
  167. (roc.AIC <- roc(hosp ~ p.AIC, data = pred.phat))
  168. # save the coordinates in a data frame for plotting.
  169. roc.df.AIC <- coords(roc.AIC, transpose = FALSE)
  170. roc.df.AIC$model <- "AIC"
  171. head(roc.df.AIC)
  172.  
  173.  
  174.  
  175. ## for BIC2 model
  176. (roc.BIC2 <- roc(hosp ~ p.BIC2, data = pred.phat))
  177. # save the coordinates in a data frame for plotting.
  178. roc.df.BIC2 <- coords(roc.BIC2, transpose = FALSE)
  179. roc.df.BIC2$model <- "BIC2"
  180. head(roc.df.BIC2)
  181.  
  182.  
  183. ## for BIC3 model
  184. (roc.BIC3 <- roc(hosp ~ p.BIC3, data = pred.phat))
  185. # save the coordinates in a data frame for plotting.
  186. roc.df.BIC3 <- coords(roc.BIC3, transpose = FALSE)
  187. roc.df.BIC3$model <- "BIC3"
  188. head(roc.df.BIC3)
  189.  
  190.  
  191.  
  192. roc.df <- rbind(roc.df.health, roc.df.agesquared, roc.df.AIC, roc.df.BIC2, roc.df.BIC3 )
  193.  
  194.  
  195.  
  196. # Create the data for the Ideal model by hand:
  197. roc.df.ideal <- data.frame(sensitivity = c(0, 1, 1),
  198.                            specificity = c(1, 1, 0),
  199.                            threshold = c(NA, NA, NA))
  200. roc.df.ideal$model <- "ideal"
  201.  
  202.  
  203.  
  204. ggplot(roc.df, aes(specificity, sensitivity,
  205.                    color = model)) +
  206.   geom_path(size = 1) +
  207.   coord_fixed() +       # square plotting area
  208.   scale_x_reverse() +   # Reverse scale on the x-axis!
  209.   labs(title = "ROC-curves for all the models") +
  210.   theme(text = element_text(size = 14))
  211.  
  212.  
  213.  
  214.  
  215.  
  216.  
  217. #Collect AUC and intervals for all the models:
  218. (aucs <-
  219.     data.frame(
  220.       model = c ("health", "agesquared", "AIC", "BIC2", "BIC3"),
  221.       auc = c(auc(roc.health), auc(roc.agesquared), auc(roc.AIC), auc(roc.BIC2),
  222.               auc(roc.BIC3)),
  223.       lwr = c(ci(roc.health)[1],
  224.               ci(roc.agesquared)[1],
  225.               ci(roc.AIC)[1], ci(roc.BIC2)[1],
  226.               ci(roc.BIC3)[1]),
  227.       upr = c(ci(auc(roc.health))[3], ci(auc(roc.agesquared))[3],
  228.               ci(auc(roc.AIC))[3], ci(auc(roc.BIC2))[3],
  229.               ci(auc(roc.BIC3))[3])))
  230.  
  231.  
  232. # Compare the AUC for the models:
  233. roc.test(roc.AIC, roc.health)
  234. roc.test(roc.AIC, roc.agesquared)
  235. roc.test(roc.AIC, roc.BIC2)
  236. roc.test(roc.AIC, roc.BIC3)
  237.  
  238.  
  239.  
  240. ####
  241. ####   (c)
  242. ###
  243.  
  244. # experiment with different values of levels for sens and spec, level.ss, to find
  245. # the one that gives the optimal combination of sens and spec.
  246. level.ss <- 0.6255
  247. roc.df.AIC[roc.df.AIC$sensitivity > level.ss &
  248.              roc.df.AIC$specificity > level.ss, ]
  249. ##
  250. (I_max.AIC <- which(roc.df.AIC$sensitivity > level.ss &
  251.                       roc.df.AIC$specificity > level.ss))
  252. roc.df.AIC[I_max.AIC, ]
  253. # Pick out the corresponding threshold for p:
  254. roc.df.AIC[I_max.AIC, "threshold"]
  255.  
  256.  
  257.  
  258. (pred.phat$yhat.AIC <- as.numeric(pred.phat$p.AIC > roc.df.AIC[I_max.AIC, "threshold"]))
  259. # pred.phat$yhat.oslo <- as.numeric(pred.phat$p.oslo > 0.5)
  260.  
  261. (row.01 <- table(hospital$hosp))
  262.  
  263. # the confiusion matrix is a confiusion vectore just ad [1 0 0]' at the end
  264. (col.01.AIC <- table(pred.phat$yhat.AIC))
  265. (confusion.AIC <- table(pred.phat$hosp, pred.phat$yhat.AIC))
  266. (spec.AIC <- confusion.AIC[1, 1] / (confusion.AIC[1, 1]+confusion.AIC[1, 2]) )
  267. (sens.AIC <- confusion.AIC[2, 2] / (confusion.AIC[2, 1]+confusion.AIC[2, 2]) )
  268. (accu.AIC <- sum(diag(confusion.AIC)) / sum(confusion.AIC))
  269. (prec.AIC <- confusion.AIC[2, 2] / (confusion.AIC[1, 2]+confusion.AIC[2, 2])  )
  270.  
  271.  
  272.  
  273.  
  274.  
  275.  
  276. # Hosmer-Lemeshow-test####
  277. # Illustrating example: plot in sorted p-order
  278. # order(variable) gives the ranks for the values in variable.
  279. # It can then be used to sort the data frame:
  280. pred.sort <- pred.phat[order(pred.phat$p.AIC), ]
  281. pred.sort$rank <- seq(1, nrow(pred.sort))
  282. head(pred.sort)
  283.  
  284.  
  285.   # Divide the n=500 observations into g=10 groups:
  286.   n <- nrow(pred.sort)
  287.   g <- 10
  288.   # with ng = 50 observations each:
  289.   ng <- n/g
  290.  
  291.   # Plot p_i and Y_i
  292.   # Add i vertical jitter to Y_i to separate them
  293.   ggplot(pred.sort, aes(rank, p.AIC)) +
  294.     geom_point() +
  295.     geom_jitter(aes(y = hosp), height = 0.01) +
  296.     geom_vline(xintercept = seq(ng, nrow(pred.sort) - ng, ng)) +
  297.     labs(title = "Model 3: Estimated probabilities by increasing size",
  298.          caption = "g = 10 groups",
  299.          x = "(i) = 1,...,n", y = "p-hat") +
  300.     theme(text = element_text(size = 14))
  301.  
  302.  
  303.  
  304.   # HL by hand####
  305.   # The following can be done using the output of the
  306.   # hoslem.test function, see below.
  307.   # I only do it here to illustrate the steps.
  308.  
  309.   # A for-loop to set the group numbers:
  310.   pred.sort$group <- NA
  311.   for (k in seq(1, g)) {
  312.     I <- (k - 1)*ng + seq(1, ng)
  313.     pred.sort$group[I] <- k
  314.   }
  315.   head(pred.sort)
  316.  
  317.   # Calculate Observed and Expected in each group:
  318.   # aggregate(y ~ x, FUN = mean) calculates the mean of y
  319.   # separately for each group.
  320.   # merge(data1, data2) joins two data frames using any common
  321.   # variables as keys, in this case, "group".
  322.  
  323.   # Number of successes:
  324.   OE1 <- merge(aggregate(hosp ~ group, data = pred.sort, FUN = sum),
  325.                aggregate(p.AIC ~ group, data = pred.sort, FUN = sum))
  326.   OE1
  327.   # Number of failures = n_g - successes:
  328.   OE0 <- OE1
  329.   OE0$hosp <- ng - OE1$hosp
  330.   OE0$p.AIC <- ng - OE1$p.AIC
  331.   # A variable to use for color coding:
  332.   OE1$outcome <- "Y = 1"
  333.   OE0$outcome <- "Y = 0"
  334.   # Bind the two data sets as rows (r):
  335.   (OE <- rbind(OE1, OE0))
  336.  
  337.  
  338.  
  339.  
  340.   # And plot:
  341.   # Set the tickmarks on the x-axis to integers 1,...,g
  342.   # Note the linetype inside the aes() to set different
  343.   # linetype to O and E automatically
  344.  
  345.   ggplot(OE, aes(group, p.AIC, color = outcome)) +
  346.     geom_line(aes(linetype = "expected"), size = 1) +
  347.     geom_line(aes(y = hosp, linetype = "observed"), size = 1) +
  348.     labs(title = "Model AIC: Observed and expected in each group",
  349.          y = "number of observations") +
  350.     theme(text = element_text(size = 14)) +
  351.     scale_x_continuous(breaks = seq(1, g))
  352.  
  353.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement