Untitled


# plotting
library(ggplot2)
# Necessary for multinomial regression:
library(nnet)

rm(list=ls())

plasma <- read.delim("plasma.txt")
head(plasma)
summary(plasma)

plasma$sex <- factor(plasma$sex,
                      levels = c(1, 2),
                      labels = c("Male", "Female"))
plasma$smokstat <- factor(plasma$smokstat,
                           levels = c(1, 2, 3),
                           labels = c("Never smoked", "Former smoker", "Current smoker"))
plasma$bmicat <- factor(plasma$bmicat,
                         levels = c(1, 2, 3, 4),
                         labels = c("Underweight", "Normal weight", "Overweight", "Obese"))
plasma$vituse <- factor(plasma$vituse,
                         levels = c(1, 2, 3),
                         labels = c("Frequently", "Not_frequently", "Never"))

#low category
(I_plasma_low <- as.numeric(plasma$betaplasma < 75))
sum(I_plasma_low)

#medium category
(I_plasma_med <- as.numeric(plasma$betaplasma >= 75 & plasma$betaplasma < 300))
sum(I_plasma_med)

#high category
(I_plasma_high <- as.numeric(plasma$betaplasma >= 300))
sum(I_plasma_high)

#changing reference categories
plasma$bmicat <- relevel(plasma$bmicat, "Normal weight")
plasma$sex <- relevel(plasma$sex, "Female")
plasma$smokstat <- relevel(plasma$smokstat, "Never smoked")
plasma$vituse <- relevel(plasma$vituse, "Never")

############# new stuff
#adding the values
plasma$betaplasma_cat <- I_plasma_low + 2*I_plasma_med + 3*I_plasma_high

#changing 1, 2, 3 to low, medium, high
plasma$betaplasma_cat <- factor(plasma$betaplasma_cat,
                        levels = c(1, 2, 3),
                        labels = c("low", "medium", "high"))

#relevel to medium being reference category
plasma$betaplasma_cat <- relevel(plasma$betaplasma_cat, ref="medium")

#presenting table
table(plasma$betaplasma_cat)

#############


#####Null and full model for the step models
# null model for comparisons####
(model.null <- multinom(betaplasma_cat ~ 1, data = plasma))
(sum.null <- summary(model.null))

# full model for upper limit in stepwise####
(model.full <- multinom(betaplasma_cat ~ age + sex + smokstat + quetelet + bmicat + vituse + calories + fat, data = plasma))
(sum.full <- summary(model.full))


############# EVERYTHING FOR AIC AS CRITERION

#####
##### Stepwise procedure using AIC####
model.AIC <- step(model.null,
                    scope = list(upper = model.full),
                    direction = "both")
(sum.AIC <- summary(model.AIC))

####### TEST for the AIC model -- beta values, standard error, z-value, P-value
(beta.AIC <- sum.AIC$coefficients)
(se.beta.AIC <- sum.AIC$standard.errors)
(z.value.AIC <- beta.AIC/se.beta.AIC)
(P.value.AIC <- pnorm(abs(z.value.AIC), lower.tail = FALSE))

#odds ratios for the AIC model####
(OR.AIC <- exp(beta.AIC))
OR.AIC["low", ]
OR.AIC["high", ]

# Confidence intervals for OR, AIC model:
ci.AIC <- exp(confint(model.AIC))
# a 3-dimensional matrix!
ci.AIC


## beta values, P values, odds ratio as well as confidence interval for the odds ratio
(AIC.test.beta <- cbind(
  beta = round(beta.AIC["low", ], digits = 2),
  P.value = round(P.value.AIC["low", ], digits = 3),
  Odds_Ratio = round(OR.AIC["low", ], digits = 2),
  round(ci.AIC[, , "low"], digits = 2)))


#####
##### Stepwise procedure using BIC####
model.BIC <- step(model.null,
                     scope = list(upper = model.full, lower = model.null),
                     direction = "both",
                     k = log(nrow(plasma)))
(sum.BIC <- summary(model.BIC))
model.grades

####### TEST for the BIC model -- beta values, standard error, z-value, P-value
(beta.BIC <- sum.BIC$coefficients)
(se.beta.BIC <- sum.BIC$standard.errors)
(z.value.BIC <- beta.BIC/se.beta.BIC)
(P.value.BIC <- pnorm(abs(z.value.BIC), lower.tail = FALSE))

#odds ratios for the AIC model####
(OR.BIC <- exp(beta.BIC))
OR.BIC["low", ]
OR.BIC["high", ]

# Confidence intervals for OR, AIC model:
ci.BIC <- exp(confint(model.BIC))
# a 3-dimensional matrix!
ci.BIC


## beta values, P values, odds ratio as well as confidence interval for the odds ratio
(BIC.test.beta <- cbind(
  beta = round(beta.BIC["low", ], digits = 2),
  P.value = round(P.value.BIC["low", ], digits = 3),
  Odds_Ratio = round(OR.BIC["low", ], digits = 2),
  round(ci.BIC[, , "low"], digits = 2)))


###################

###Likelihood ratio test for the AIC model
anova(update(model.AIC, . ~ . - quetelet), model.AIC)
anova(update(model.AIC, . ~ . - vituse), model.AIC)
anova(update(model.AIC, . ~ . - age), model.AIC)
anova(update(model.AIC, . ~ . - sex), model.AIC) #[2,"Pr(Chi)"]

###Likelihood ratio test for the BIC model
anova(update(model.BIC, . ~ . - quetelet), model.BIC)
anova(update(model.BIC, . ~ . - vituse), model.BIC)


###################
#PLOTTING AIC
######### AVERAGE AGE, VARYING SEX, VARYING BMI
# range of betaplasma levels, repeated twice,
# for public and private school
x0 <- data.frame(quetelet = rep(seq(16, 51), 2))
# the same ses, socst and science for all:
#x0$vituse <- "Never"
x0$vituse <- "Frequently"
x0$age <- mean(plasma$age)
x0$sex <- c(rep("Female", 51 - 16 + 1),
               rep("Male", 51 - 16 + 1))


pred.x0 <- cbind(
  x0,
  predict(model.AIC, newdata = x0, type = "probs"))
head(pred.x0)


ggplot(pred.x0, aes(x = quetelet)) +
  geom_line(aes(y = low, color = "3. Low"), size = 2) +
  geom_line(aes(y = medium, color = "2. Medium"), size = 2) +
  geom_line(aes(y = high, color = "1. High"), size = 2) +
  expand_limits(y = c(0, 1)) +
  facet_grid(~ sex) +
  labs(title = "Multinomial: Average age with varying sex and BMI, frequent vituse, for AIC model",
       y = "probability",
       fill = "Program") +
  theme(text = element_text(size = 14))

ggplot(pred.x0, aes(x = quetelet)) +
  geom_ribbon(aes(ymin = 0, ymax = low, fill = "3. Low")) +
  geom_ribbon(aes(ymin = low, ymax = low + medium,
                  fill = "2. Medium")) +
  geom_ribbon(aes(ymin = low + medium, ymax = 1,
                  fill = "1. High")) +
  facet_grid(~ sex) +
  labs(title = "Multinomial: Average age with varying sex and BMI, frequent vituse, for AIC model",
       y = "probability",
       fill = "Level of concentration") +
  theme(text = element_text(size = 14))


###################
#PLOTTING BIC
######### AVERAGE AGE, VARYING SEX, VARYING BMI, FREQUENT VITUSE
# range of betaplasma levels, repeated twice,
# for public and private school
x0.BIC <- data.frame(quetelet = rep(seq(16, 51), 3))
# the same ses, socst and science for all:
x0.BIC$vituse <- c(rep("Frequently", 51 - 16 + 1),
            rep("Not_frequently", 51 - 16 + 1),
            rep("Never", 51 - 16 + 1))


pred.x0.BIC <- cbind(
  x0.BIC,
  predict(model.BIC, newdata = x0.BIC, type = "probs"))
head(pred.x0.BIC)


ggplot(pred.x0.BIC, aes(x = quetelet)) +
  geom_line(aes(y = low, color = "3. Low"), size = 2) +
  geom_line(aes(y = medium, color = "2. Medium"), size = 2) +
  geom_line(aes(y = high, color = "1. High"), size = 2) +
  facet_grid(~ vituse) +
  expand_limits(y = c(0, 1)) +
  labs(title = "Multinomial: Varying vituse and BMI, for BIC model",
       y = "probability",
       fill = "Program") +
  theme(text = element_text(size = 14))

ggplot(pred.x0.BIC, aes(x = quetelet)) +
  geom_ribbon(aes(ymin = 0, ymax = low, fill = "3. Low")) +
  geom_ribbon(aes(ymin = low, ymax = low + medium,
                  fill = "2. Medium")) +
  geom_ribbon(aes(ymin = low + medium, ymax = 1,
                  fill = "1. High")) +
  facet_grid(~ vituse) +
  labs(title = "Multinomial: Varying vituse and BMI, for BIC model",
       y = "probability",
       fill = "Level of concentration") +
  theme(text = element_text(size = 14))

################################################
######LIKELIHOOD RATIO TEST FOR THE MODELS######
################################################

# LR (deviance) tests####
# AIC against null model
anova(model.null, model.AIC)
# BIC against null model
anova(model.null, model.BIC)
# AIC against BIC model
anova(model.BIC, model.AIC)
# AIC against full model
anova(model.AIC, model.full)
# BIC against full model
anova(model.BIC, model.full)

#AIC and BIC for the models
info <- cbind(aic = AIC(model.null, model.AIC, model.BIC, model.full),
              bic = BIC(model.null, model.AIC, model.BIC, model.full))
#R^2 for the models (McF?)
info$r2 <- round(100*c(
  0,
  1 - model.AIC$deviance/model.null$deviance,
  1 - model.BIC$deviance/model.null$deviance,
  1 - model.full$deviance/model.null$deviance), digits = 1)
#R^2 adjusted for the models (McF?)
info$r2.adj <- round(100*c(
  0,
  1 - (model.AIC$deviance + (model.AIC$edf - model.null$edf))/model.null$deviance,
  1 - (model.BIC$deviance + (model.BIC$edf - model.null$edf))/model.null$deviance,
  1 - (model.full$deviance + (model.full$edf - model.null$edf))/model.null$deviance),
  digits = 1)

#The AIC, BIC, R^2 and R^2 adjusted for all models
(info)


################################################
######CONFUSION MATRICES FOR THE AIC MODEL######
################################################

# estimated probs####
predict(model.AIC, type = "prob")
# predicted category
predict(model.AIC)
predict(model.AIC, type = "class")

pred.final.AIC <- cbind(
  plasma,
  predict(model.AIC, type = "probs"),
  yhat = predict(model.AIC))
head(pred.final.AIC)

# confusion matrix####
table(pred.final.AIC$betaplasma_cat)
table(pred.final.AIC$yhat)

(conf.final <- table(pred.final.AIC$betaplasma_cat, pred.final.AIC$yhat))
# Sensitivities:
sens.AIC <- round(100*diag(conf.final)/table(pred.final.AIC$betaplasma_cat), digits = 1)
# precisions:
prec.AIC <-round(100*diag(conf.final)/table(pred.final.AIC$yhat), digits = 1)
# accuracy:
acc.AIC <- 100*sum(diag(conf.final))/sum(conf.final)

(Conf.stats.table.AIC <- data.frame(value = c("Medium", "Low", "High"), Sens_Spec = as.numeric(sens.AIC), Prec = as.numeric(prec.AIC), Acc = as.numeric(acc.AIC)))


################################################
######CONFUSION MATRICES FOR THE BIC MODEL######
################################################


# estimated probs####
predict(model.BIC, type = "prob")
# predicted category
predict(model.BIC)
predict(model.BIC, type = "class")

pred.final.BIC <- cbind(
  plasma,
  predict(model.BIC, type = "probs"),
  yhat = predict(model.BIC))
head(pred.final.BIC)

# confusion matrix####
table(pred.final.BIC$betaplasma_cat)
table(pred.final.BIC$yhat)

(conf.final.BIC <- table(pred.final.BIC$betaplasma_cat, pred.final.BIC$yhat))
# Sensitivities:
sens.BIC <- round(100*diag(conf.final.BIC)/table(pred.final.BIC$betaplasma_cat), digits = 1)
# precisions:
prec.BIC <-round(100*diag(conf.final.BIC)/table(pred.final.BIC$yhat), digits = 1)
# accuracy:
acc.BIC <- 100*sum(diag(conf.final.BIC))/sum(conf.final.BIC)

(Conf.stats.table.BIC <- data.frame(value = c("Medium", "Low", "High"), Sens_Spec = as.numeric(sens.BIC), Prec = as.numeric(prec.BIC), Acc = as.numeric(acc.BIC)))