---
title: "Statistical Computing Assignment 1"
author: "Ruth Turpie s1643103"
date: "22/02/2020"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
# importing the required libraries
library(ggplot2)
library(readr)
suppressPackageStartupMessages(library(knitr))
filament <- read_csv("filament.csv")
```
Question 1

In Figure 1.1 the CAD weight vs actual weight data has been plotted with matching colours for each material. The black line represents the line y = x, which we would expect to pass through all the points if the CAD weight were equal to the actual weight. However, in the plot the data points lie largely above y = x, so it appears that the actual weight tends to be greater than the CAD weight.
```{r}
# attaching the data frame to allow the columns inside to be accessed easily
attach(filament)
# also separating the filament data by class for easier access later
filament_obs <- subset(filament, Class == "obs")
filament_test <- subset(filament, Class == "test")
# a set of colours that will match up with the colours in the Material column,
# used throughout this report in later plots
# note magenta is quite light to help differentiate it from the neon pink
material_colour <- c("black", "#8bb565", "#f693bf", "#01aae1", "#e03fd8", "red")
# using ggplot to plot CAD weight vs actual weight
ggplot(data = filament) +
  geom_point(aes(x = CAD_Weight, y = Actual_Weight, col = Material)) +
  geom_abline(slope = 1, size = 0.25) + # adds the line y = x
  scale_color_manual(values = material_colour) +
  ggtitle("Figure 1.1: CAD Weight vs Actual Weight") +
  labs(x = "CAD Weight, grams", y = "Actual Weight, grams") +
  theme(plot.title = element_text(hjust = 0.5))
```
Question 2

Defining a function model_estimate() that takes the parameters formulas, data, and response, where formulas and data have the same interpretation as the input arguments to model_Z(), and response is a character/string variable naming the response (measured outcome) column of the data. The function estimates the model by numerical optimisation and returns a list with three named elements, theta_hat, formulas, and Sigma_theta, containing the information needed to call the model_predict() function.
```{r}
# model_Z is taken directly from CWA2020code.R
model_Z <- function(formulas, data) {
  ZE <- model.matrix(formulas$E, data = data)
  ZV <- model.matrix(formulas$V, data = data)
  list(ZE = cbind(ZE, ZV * 0), ZV = cbind(ZE * 0, ZV))
}
# neg_log_lik is taken directly from CWA2020code.R
neg_log_lik <- function(theta, Z, y) {
  -sum(dnorm(y, mean = Z$ZE %*% theta, sd = exp(Z$ZV %*% theta)^0.5, log = TRUE))
}
# model_estimate() estimates a model by numerical optimisation with the BFGS method
# Input:
#   formulas: a list of formulas, of the kind needed by model_Z()
#   data: covariate information, in whatever format model_Z() wants it
#   response: a string naming the response variable column of the data
# Output:
#   A list with entries 'formulas', 'theta_hat', and 'Sigma_theta',
#   as required by the model_predict() function
model_estimate <- function(formulas, data, response) {
  Z <- model_Z(formulas, data)
  opt <- optim(par = rep(0, ncol(Z$ZE)),
               fn = neg_log_lik, Z = Z, y = data[[response]],
               method = "BFGS",
               control = list(maxit = 5000),
               hessian = TRUE)
  theta_hat <- opt$par
  # the inverse Hessian of the negative log-likelihood approximates
  # the covariance matrix of the parameter estimates
  Sigma_theta <- solve(opt$hessian)
  list("formulas" = formulas, "theta_hat" = theta_hat, "Sigma_theta" = Sigma_theta)
}
```
Question 3

Estimating a model for actual weight using the observed data, with an intercept and covariate CAD Weight for the model expectation, and an intercept only for the model (log-)variance. This model will from this point on be known as "Model 1".
```{r}
# E is the formula for the expectation, with an intercept and covariate
# V is the formula for the variance, with an intercept only
formulas_model1 <- list("E" = ~ 1 + CAD_Weight, "V" = ~ 1)
# estimating the model for actual weight using the observed data
estimate_model1 <- model_estimate(formulas_model1, filament_obs, "Actual_Weight")
```
Question 4

Figure 4.1 shows the test data together with the prediction intervals for the estimated Model 1. This is to assess how well the estimated model, which was fitted using the observed data, predicts the test data. A value of alpha = 0.1 has been used throughout this report so that the prediction intervals are directly comparable across models.

This figure appears to show that the prediction interval is in general fairly accurate (it at least follows a path slightly above y = x, implying that most actual weights are slightly bigger than their CAD weights, as expected). However, the test data points at high CAD weights, or at least materials with high CAD weights, lie significantly far outwith the prediction interval, suggesting that this estimated model is not good at predicting test data for high CAD weights.
```{r}
# model_predict is taken directly from CWA2020code.R
model_predict <- function(theta, formulas, Sigma_theta = NULL,
                          newdata,
                          type = c("expectation", "log-variance", "observation"),
                          alpha = 0.05, df = Inf,
                          nonlinear.correction = TRUE) {
  type <- match.arg(type)
  Z <- model_Z(formulas, data = newdata)
  fit_E <- Z$ZE %*% theta
  fit_V <- Z$ZV %*% theta
  if (is.null(Sigma_theta)) {
    ZE_var <- 0
    ZV_var <- 0
  } else {
    ZE_var <- rowSums(Z$ZE * (Z$ZE %*% Sigma_theta))
    ZV_var <- rowSums(Z$ZV * (Z$ZV %*% Sigma_theta))
  }
  if (type == "expectation") {
    fit <- fit_E
    sigma <- ZE_var^0.5
  } else if (type == "log-variance") {
    fit <- fit_V
    sigma <- ZV_var^0.5
  } else if (type == "observation") { ## observation predictions
    fit <- fit_E
    sigma <- (exp(fit_V + ZV_var / 2 * nonlinear.correction) + ZE_var)^0.5
  }
  q <- qt(1 - alpha / 2, df = df)
  lwr <- fit - q * sigma
  upr <- fit + q * sigma
  data.frame(mu = fit, sigma, lwr, upr)
}
predict_model1 <- model_predict(estimate_model1$theta_hat, estimate_model1$formulas,
                                Sigma_theta = estimate_model1$Sigma_theta,
                                newdata = filament_test,
                                alpha = 0.1, type = "observation")
ggplot(cbind(filament_test, predict_model1)) +
  geom_ribbon(aes(CAD_Weight, ymin = lwr, ymax = upr),
              alpha = 0.5, fill = "#72edd0") +
  geom_point(aes(y = Actual_Weight, x = CAD_Weight, col = Material)) +
  ggtitle("Figure 4.1: Prediction Intervals for Model 1 with Test Data Points") +
  scale_color_manual(values = material_colour) +
  labs(x = "CAD Weight, grams", y = "Actual Weight, grams") +
  geom_abline(slope = 1, size = 0.25) + # adds the line y = x
  theme(plot.title = element_text(hjust = 0.5))
```
Question 5

Estimating a model for actual weight using the observed data, with an intercept and covariate CAD Weight for the model expectation, and also an intercept and covariate CAD Weight for the model (log-)variance. This model will from this point on be known as "Model 2".
```{r}
# E is the formula for the expectation, with an intercept and covariate
# V is the formula for the variance, with an intercept and covariate
formulas_model2 <- list("E" = ~ 1 + CAD_Weight, "V" = ~ 1 + CAD_Weight)
# estimating the model for actual weight using the observed data
# (note: the observed subset is used here, matching Model 1)
estimate_model2 <- model_estimate(formulas_model2, filament_obs, "Actual_Weight")
```
Question 6

Figure 6.1 shows the test data together with the prediction intervals for the estimated Model 2, and will be analysed in a similar fashion to that of Model 1 in Question 4. A value of alpha = 0.1 has once again been used so the intervals are comparable across models.

This model takes into account the covariate CAD Weight in the model (log-)variance, which widens the prediction interval for higher CAD weights and narrows it for low CAD weights. This seems to predict the test data quite well, as the points lie far more within the ribbon representing the prediction interval. At lower CAD weights the test points do not deviate far from y = x and are "easier" to predict, so the prediction interval can be narrow, whilst the opposite is true for the higher CAD weights. This analysis alone suggests Model 2 may be a better model than Model 1.
```{r}
predict_model2 <- model_predict(estimate_model2$theta_hat, estimate_model2$formulas,
                                Sigma_theta = estimate_model2$Sigma_theta,
                                newdata = filament_test,
                                alpha = 0.1, type = "observation")
ggplot(cbind(filament_test, predict_model2)) +
  geom_ribbon(aes(CAD_Weight, ymin = lwr, ymax = upr),
              alpha = 0.5, fill = "#72edd0") +
  geom_point(aes(y = Actual_Weight, x = CAD_Weight, col = Material)) +
  ggtitle("Figure 6.1: Prediction Intervals for Model 2 with Test Data Points") +
  scale_color_manual(values = material_colour) +
  labs(x = "CAD Weight, grams", y = "Actual Weight, grams") +
  geom_abline(slope = 1, size = 0.25) + # adds the line y = x
  theme(plot.title = element_text(hjust = 0.5))
```
Question 7

The squared error, Dawid-Sebastiani, and interval scores are computed for Models 1 and 2 on the test data, and the pairwise differences (Model 2 minus Model 1) are plotted as empirical CDFs; since lower scores are better, negative differences indicate test points where Model 2 scores better.
```{r}
# The following three functions are the score functions from CWA2020code.R.
# They compute the Squared Error, Dawid-Sebastiani, and Interval scores respectively
score_se <- function(pred, y) {
  (y - pred$mu)^2
}
score_ds <- function(pred, y) {
  ((y - pred$mu) / pred$sigma)^2 + 2 * log(pred$sigma)
}
score_interval <- function(pred, y, alpha = 0.1) {
  L <- pred$lwr
  U <- pred$upr
  U - L + 2 / alpha * (pmax(0, L - y) + pmax(0, y - U))
}
# model_scores returns a list of score value vectors for a given model prediction
# Input:
#   pred: data.frame with (at least) columns 'mu', 'sigma', 'lwr', and 'upr'
#   y: data vector
# Output:
#   A list of score value vectors
model_scores <- function(pred, y) {
  list("score_se" = score_se(pred, y),
       "score_ds" = score_ds(pred, y),
       "score_interval" = score_interval(pred, y))
}
# finding the scores for Models 1 and 2
scores_model1 <- model_scores(predict_model1, filament_test$Actual_Weight)
scores_model2 <- model_scores(predict_model2, filament_test$Actual_Weight)
print(scores_model1$score_se)
print(scores_model2$score_se)
# the pairwise differences between the scores, Model 2 minus Model 1
scores_diff2_1 <- mapply('-', scores_model2, scores_model1, SIMPLIFY = FALSE)
print(scores_diff2_1$score_se)
# a named list of equal-length vectors converts directly to a data frame
df <- data.frame(scores_diff2_1)
ggplot(df) +
  stat_ecdf(geom = "step", aes(score_se, colour = "Squared Error score")) +
  stat_ecdf(geom = "step", aes(score_ds, colour = "Dawid-Sebastiani score")) +
  stat_ecdf(geom = "step", aes(score_interval, colour = "Interval score")) +
  scale_colour_manual("",
                      breaks = c("Squared Error score", "Dawid-Sebastiani score", "Interval score"),
                      values = c("#32a852", "#326eab", "#b0337e")) +
  labs(x = "Score difference, Model 2 - Model 1", y = "Empirical CDF")
```
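As a sanity check on the interval score, a toy illustration (hypothetical interval and observations, not the filament data): a point inside the interval is charged only the interval width, while a point outside is additionally penalised by 2/alpha times its distance from the nearest endpoint.

```{r}
# score_interval as defined above, repeated so this sketch is self-contained
score_interval <- function(pred, y, alpha = 0.1) {
  L <- pred$lwr
  U <- pred$upr
  U - L + 2 / alpha * (pmax(0, L - y) + pmax(0, y - U))
}
pred <- data.frame(lwr = c(0, 0), upr = c(10, 10))
# y = 5 is inside [0, 10]: score = width = 10
# y = 12 is 2 above the upper end: score = 10 + (2 / 0.1) * 2 = 50
score_interval(pred, y = c(5, 12))
```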
Question 8

Model 3 extends Model 2 by allowing a separate CAD Weight slope for each material (a Material:CAD_Weight interaction) in the expectation formula. Its scores on the test data are compared pairwise against Model 1 and Model 2, again via empirical CDFs of the score differences.
```{r}
formulas_model3 <- list("E" = ~ 1 + Material:CAD_Weight, "V" = ~ 1 + CAD_Weight)
estimate_model3 <- model_estimate(formulas_model3, filament_obs, "Actual_Weight")
predict_model3 <- model_predict(estimate_model3$theta_hat, estimate_model3$formulas,
                                Sigma_theta = estimate_model3$Sigma_theta,
                                newdata = filament_test,
                                alpha = 0.1, type = "observation")
scores_model3 <- model_scores(predict_model3, filament_test$Actual_Weight)
# pairwise score differences: Model 3 minus Model 1, and Model 3 minus Model 2
scores_diff3_1 <- mapply('-', scores_model3, scores_model1, SIMPLIFY = FALSE)
scores_diff3_2 <- mapply('-', scores_model3, scores_model2, SIMPLIFY = FALSE)
df <- data.frame(scores_diff3_1)
ggplot(df) +
  stat_ecdf(geom = "step", aes(score_se, colour = "Squared Error score")) +
  stat_ecdf(geom = "step", aes(score_ds, colour = "Dawid-Sebastiani score")) +
  stat_ecdf(geom = "step", aes(score_interval, colour = "Interval score")) +
  scale_colour_manual("",
                      breaks = c("Squared Error score", "Dawid-Sebastiani score", "Interval score"),
                      values = c("#32a852", "#326eab", "#b0337e")) +
  labs(x = "Score difference, Model 3 - Model 1", y = "Empirical CDF")
df <- data.frame(scores_diff3_2)
ggplot(df) +
  stat_ecdf(geom = "step", aes(score_se, colour = "Squared Error score")) +
  stat_ecdf(geom = "step", aes(score_ds, colour = "Dawid-Sebastiani score")) +
  stat_ecdf(geom = "step", aes(score_interval, colour = "Interval score")) +
  scale_colour_manual("",
                      breaks = c("Squared Error score", "Dawid-Sebastiani score", "Interval score"),
                      values = c("#32a852", "#326eab", "#b0337e")) +
  labs(x = "Score difference, Model 3 - Model 2", y = "Empirical CDF")
```
Question 9

First, the code computes, from each of the models, the probability that Actual Weight >= 1.1 * CAD Weight for each test data point. Figure 9.1 plots CAD Weight against this probability for both Model 1 and Model 2. Figure 9.2 shows the same for Model 3, where the impact of the materials on these trends can be seen. For blue, green and black, the larger the CAD Weight the less likely the actual weight is to be 1.1 times bigger, while the opposite appears true for red. There is not much data to show trends for the other materials. Alternatively, viewed without the lens of material, the trend appears vaguely parabolic with a minimum at around CAD Weight = 37.5, suggesting that the probability decreases as CAD Weight increases up to around that point and then increases again.
```{r}
# using Model 1 to compute the probabilities for the Brier score function
prob_model1 <- pnorm(1.1 * filament_test$CAD_Weight, mean = predict_model1$mu,
                     sd = predict_model1$sigma, lower.tail = FALSE)
# using Model 2 to compute the probabilities for the Brier score function
prob_model2 <- pnorm(1.1 * filament_test$CAD_Weight, mean = predict_model2$mu,
                     sd = predict_model2$sigma, lower.tail = FALSE)
# using Model 3 to compute the probabilities for the Brier score function
prob_model3 <- pnorm(1.1 * filament_test$CAD_Weight, mean = predict_model3$mu,
                     sd = predict_model3$sigma, lower.tail = FALSE)
# collecting the probabilities for the different models in a data frame
probs <- data.frame("prob_model1" = prob_model1,
                    "prob_model2" = prob_model2,
                    "prob_model3" = prob_model3)
# plotting the probabilities for Models 1 and 2
ggplot(cbind(filament_test, probs)) +
  geom_line(aes(x = CAD_Weight, y = prob_model1, colour = "Model 1")) +
  geom_line(aes(x = CAD_Weight, y = prob_model2, colour = "Model 2")) +
  scale_colour_manual("", values = c("Model 1" = "#326eab", "Model 2" = "#b0337e")) +
  ggtitle("Figure 9.1: P(Actual Weight >= 1.1 * CAD Weight), Models 1 and 2") +
  labs(x = "CAD Weight, grams", y = "Probability") +
  theme(plot.title = element_text(hjust = 0.5))
# plotting the probabilities for Model 3
ggplot(cbind(filament_test, probs)) +
  geom_line(aes(x = CAD_Weight, y = prob_model3, col = Material)) +
  geom_point(aes(x = CAD_Weight, y = prob_model3, col = Material)) +
  scale_color_manual(values = material_colour) +
  ggtitle("Figure 9.2: P(Actual Weight >= 1.1 * CAD Weight), Model 3") +
  labs(x = "CAD Weight, grams", y = "Probability") +
  theme(plot.title = element_text(hjust = 0.5))
# the binary indicators of the event Actual Weight >= 1.1 * CAD Weight
binary <- as.numeric(filament_test$Actual_Weight >= 1.1 * filament_test$CAD_Weight)
print(binary)
# the Brier score for a predicted probability p of a binary event z
brier <- function(p, z) {
  (z - p)^2
}
brier_model1 <- brier(probs$prob_model1, binary)
brier_model2 <- brier(probs$prob_model2, binary)
brier_model3 <- brier(probs$prob_model3, binary)
```
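The Brier score itself can be illustrated on toy values (hypothetical probabilities and outcomes, not the filament data): it is small when the predicted probability is close to the realised 0/1 outcome, and the mean over test points is what would be compared between models.

```{r}
# brier() as defined above: squared difference between the predicted
# probability p and the binary outcome z
brier <- function(p, z) {
  (z - p)^2
}
p <- c(0.9, 0.2, 0.6)  # hypothetical predicted probabilities
z <- c(1, 0, 0)        # hypothetical binary outcomes
brier(p, z)            # 0.01 0.04 0.36 -- confident correct predictions score best
mean(brier(p, z))      # the mean Brier score summarises a model's calibration
```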
Question 10

The function optq10() simulates N observations from a Cauchy distribution with location 2 and scale 5, and estimates the location and scale by numerical maximum likelihood, optimising the scale on the log scale so that it stays positive.

```{r}
# negative log-likelihood for a Cauchy model, with the scale parameterised
# on the log scale so the optimisation is unconstrained
nLL <- function(theta, y) {
  -sum(dcauchy(y, location = theta[[1]], scale = exp(theta[[2]]), log = TRUE))
}
test_data <- rcauchy(10000, location = 2, scale = 5)
binary <- test_data <= 0
optq10 <- function(N) {
  opt <- optim(par = c(0, 0),
               fn = nLL, y = rcauchy(N, location = 2, scale = 5),
               method = "BFGS",
               control = list(maxit = 5000))
  list("location" = opt$par[[1]], "scale" = exp(opt$par[[2]]))
}
```
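A usage sketch for optq10(), redefining nLL and optq10 from the chunk above so it is self-contained, and fixing the seed so the (random) estimates are reproducible: with a large sample the maximum likelihood estimates should land close to the true location 2 and scale 5.

```{r}
set.seed(1)  # fixed seed so this sketch is reproducible
nLL <- function(theta, y) {
  -sum(dcauchy(y, location = theta[[1]], scale = exp(theta[[2]]), log = TRUE))
}
optq10 <- function(N) {
  opt <- optim(par = c(0, 0),
               fn = nLL, y = rcauchy(N, location = 2, scale = 5),
               method = "BFGS",
               control = list(maxit = 5000))
  list("location" = opt$par[[1]], "scale" = exp(opt$par[[2]]))
}
est <- optq10(10000)
# with N = 10000 both estimates should be within a few percent of the truth
unlist(est)
```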