Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Load the 2017 World Happiness data; keep an untouched copy for reference.
dataframe <- read.csv(file = "C:/Users/micha/Downloads/happiness2017.csv", header = TRUE)
master_dataframe <- dataframe
set.seed(0897)
dataframe <- dataframe[complete.cases(dataframe), ]     # drop countries with missing values
dataframe <- as.matrix(dataframe[2:11])                 # drop the Country column, keep numeric vars
dataframe <- dataframe[sample(nrow(dataframe), 100), ]  # random sample of 100 countries
# ---- Multivariate scatterplot and summary statistics ----
library(car)    # scatterplotMatrix()
library(Rfast)  # colVars()

# Pairwise scatterplots of all ten variables
scatterplotMatrix(dataframe, main = "Matrix ScatterPlot | Eric Lee, Michael Du")

# Summary statistics
colMeans(dataframe)  # per-column means
colVars(dataframe)   # per-column variances
# NOTE(review): other summary statistics may be wanted here; unclear which ones.
# ---- Outlier scan ----
# Flag any observation more than 3.5 standard deviations from its column mean.
# FIX: the original tested only standardized_value > 3.5, silently missing
# low-side outliers; abs() checks both tails.
ncol(dataframe)
S <- cov(dataframe)
for (j in seq_len(ncol(dataframe))) {
  xbar <- mean(dataframe[, j])          # column mean, hoisted out of the row loop
  sd_j <- sqrt(S[j, j])                 # column standard deviation
  for (i in seq_len(nrow(dataframe))) {
    standardized_value <- (dataframe[i, j] - xbar) / sd_j
    if (abs(standardized_value) > 3.5) {
      print(paste("[", i, ",", j, "]", "is an outlier! value =", standardized_value))
    }
  }
}
# No outliers found; the closest has |value| 3.427 < 3.5 at [49, 6]
# ---- Formal check of multivariate normality ----
# Squared Mahalanobis distances d_i^2 = (x_i - xbar)' S^{-1} (x_i - xbar)
# should be approximately chi-squared with df = p (the number of variables)
# if the data are multivariate normal.
# stats::mahalanobis() replaces the hand-rolled centering loop and inverts S internally.
d_vector <- sort(mahalanobis(dataframe, colMeans(dataframe), S))

# Chi-squared quantiles at probabilities (i - 1/2)/n.
# FIX: df must be ncol(dataframe) (= p), not p - 1, for squared Mahalanobis
# distances of p-variate data.
n <- nrow(dataframe)
chisq_vector <- qchisq(p = (seq_len(n) - 1 / 2) / n, df = ncol(dataframe))

# QQ-style plot: points close to the 45-degree line support joint normality.
plot(d_vector, chisq_vector,
     xlab = expression("d"[i]^2), ylab = "Chi-squared Quantiles",
     main = "Checking Joint Normality | Eric Lee, Michael Du")
abline(a = 0, b = 1, col = "red")
# Looks approximately normal.
# ---- Ordinary linear model: Ladder on all nine predictors ----
dataframe <- as.data.frame(dataframe)
lin_model <- lm(
  Ladder ~ LogGDP + Social + HLE + Freedom + Generosity + Corruption +
    Positive + Negative + gini,
  data = dataframe
)
summary(lin_model)
# ---- Principal components on the nine standardized predictors ----
data <- dataframe[, 2:10]
data.bar <- apply(data, 2, mean)  # predictor means (kept for reference)

# Standardize every column to mean 0, sd 1.
# FIX: the original loop standardized only columns 1:6 of 9, and divided by
# sqrt(diag(S)[i]) where S is the covariance of the FULL 10-column matrix,
# so each variance was misaligned by one column (Ladder's variance was
# applied to LogGDP, etc.). scale() standardizes all nine columns correctly.
Z <- scale(as.matrix(data))

# Eigen-decomposition of the correlation matrix (computed once, not twice).
R <- cor(data)
eig <- eigen(R)
Val.new <- eig$values
round(Val.new, 2)
Vec.new <- eig$vectors
rownames(Vec.new) <- colnames(data)
colnames(Vec.new) <- paste0("PC", 1:9)
round(Vec.new, 2)

# Sample PC scores: one matrix product replaces the element-wise double loop.
W.new <- Z %*% Vec.new
colnames(W.new) <- paste0("PC", 1:9)

plot(data.frame(W.new))
round(cor(W.new), 3)  # PCs are uncorrelated by construction
# ---- How many components should we keep? ----
# Screeplot: the elbow suggests keeping the first 4 or 5 PCs.
plot(Val.new, type = "b", pch = 19, xlab = "", ylab = "Variances")

# Proportion of total variation explained by each PC
round(Val.new / sum(Val.new), 3)
# ---- Regression of Ladder on the standardized PC scores ----
# Full model: all nine components as explanatory variables.
PC.model.new.1 <- lm(dataframe$Ladder ~ W.new[, 1] + W.new[, 2] + W.new[, 3] +
                       W.new[, 4] + W.new[, 5] + W.new[, 6] + W.new[, 7] +
                       W.new[, 8] + W.new[, 9])
summary(PC.model.new.1)  # W4, W5, W7, and W9 appear non-significant

# Reduced model with the non-significant components removed.
PC.model.new.2 <- lm(dataframe$Ladder ~ W.new[, 1] + W.new[, 2] + W.new[, 3] +
                       W.new[, 6] + W.new[, 8])
summary(PC.model.new.2)
vif(PC.model.new.2)  # collinearity check (car); PCs should yield VIFs near 1
# Importance of each original variable within every standardized PC
# (loop replaces nine copy-pasted round() calls; see reference in discord).
for (pc in 1:9) {
  print(round(Vec.new[, pc], 3))
}
# Correlations between the PCs (W_i) and the original variables (X_j),
# which also indicate variable importance.
#
# For the eigen-decomposition mat = V Lambda V', the correlation is
#   cor(W_i, X_j) = v_ji * sqrt(lambda_i) / sqrt(mat[j, j]).
#
# @param mat  A square (co)variance or correlation matrix of the variables.
# @return     A matrix of the same size: rows X1..Xp (variables),
#             columns W1..Wp (components).
#
# FIX: eigen(mat) was recomputed inside the loop on every iteration;
# it is now computed once and the loop replaced by a single sweep().
cor.WX <- function(mat) {
  eig <- eigen(mat)
  # Scale column i of the eigenvector matrix by sqrt(lambda_i), then divide
  # each row j by the variable's standard deviation sqrt(mat[j, j]).
  vals <- sweep(eig$vectors, 2, sqrt(eig$values), `*`) / sqrt(diag(mat))
  rownames(vals) <- paste0("X", seq_len(nrow(mat)))
  colnames(vals) <- paste0("W", seq_len(nrow(mat)))
  vals
}
cor.WX(R)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement