Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Load the 2017 World Happiness data; keep an untouched copy for reference.
dataframe <- read.csv(file = "C:/Users/micha/Downloads/happiness2017.csv", header = TRUE)
master_dataframe <- dataframe
set.seed(0897)
dataframe <- dataframe[complete.cases(dataframe), ]     # drop countries with missing values
dataframe <- as.matrix(dataframe[2:11])                 # drop the Country column, keep numeric vars
dataframe <- dataframe[sample(nrow(dataframe), 100), ]  # random sample of 100 countries
# ---- Multivariate scatterplot and summary statistics ----
library(car)    # scatterplotMatrix()
library(Rfast)  # colVars()

# Pairwise scatterplots of all ten variables
scatterplotMatrix(dataframe, main = "Matrix ScatterPlot | Eric Lee, Michael Du")

# Summary statistics
colMeans(dataframe)  # per-column means
colVars(dataframe)   # per-column variances
# NOTE(review): other summary statistics may be wanted here; unclear which ones.
# ---- Outlier scan ----
# Flag any observation more than 3.5 standard deviations from its column mean.
# FIX: the original tested only standardized_value > 3.5, silently missing
# low-side outliers; abs() checks both tails.
ncol(dataframe)
S <- cov(dataframe)
for (j in seq_len(ncol(dataframe))) {
  xbar <- mean(dataframe[, j])          # column mean, hoisted out of the row loop
  sd_j <- sqrt(S[j, j])                 # column standard deviation
  for (i in seq_len(nrow(dataframe))) {
    standardized_value <- (dataframe[i, j] - xbar) / sd_j
    if (abs(standardized_value) > 3.5) {
      print(paste("[", i, ",", j, "]", "is an outlier! value =", standardized_value))
    }
  }
}
# No outliers found; the closest has |value| 3.427 < 3.5 at [49, 6]
# ---- Formal check of multivariate normality ----
# Squared Mahalanobis distances d_i^2 = (x_i - xbar)' S^{-1} (x_i - xbar)
# should be approximately chi-squared with df = p (the number of variables)
# if the data are multivariate normal.
# stats::mahalanobis() replaces the hand-rolled centering loop and inverts S internally.
d_vector <- sort(mahalanobis(dataframe, colMeans(dataframe), S))

# Chi-squared quantiles at probabilities (i - 1/2)/n.
# FIX: df must be ncol(dataframe) (= p), not p - 1, for squared Mahalanobis
# distances of p-variate data.
n <- nrow(dataframe)
chisq_vector <- qchisq(p = (seq_len(n) - 1 / 2) / n, df = ncol(dataframe))

# QQ-style plot: points close to the 45-degree line support joint normality.
plot(d_vector, chisq_vector,
     xlab = expression("d"[i]^2), ylab = "Chi-squared Quantiles",
     main = "Checking Joint Normality | Eric Lee, Michael Du")
abline(a = 0, b = 1, col = "red")
# Looks approximately normal.
# ---- Ordinary linear model: Ladder on all nine predictors ----
dataframe <- as.data.frame(dataframe)
lin_model <- lm(
  Ladder ~ LogGDP + Social + HLE + Freedom + Generosity + Corruption +
    Positive + Negative + gini,
  data = dataframe
)
summary(lin_model)
# ---- Principal components on the nine standardized predictors ----
data <- dataframe[, 2:10]
data.bar <- apply(data, 2, mean)  # predictor means (kept for reference)

# Standardize every column to mean 0, sd 1.
# FIX: the original loop standardized only columns 1:6 of 9, and divided by
# sqrt(diag(S)[i]) where S is the covariance of the FULL 10-column matrix,
# so each variance was misaligned by one column (Ladder's variance was
# applied to LogGDP, etc.). scale() standardizes all nine columns correctly.
Z <- scale(as.matrix(data))

# Eigen-decomposition of the correlation matrix (computed once, not twice).
R <- cor(data)
eig <- eigen(R)
Val.new <- eig$values
round(Val.new, 2)
Vec.new <- eig$vectors
rownames(Vec.new) <- colnames(data)
colnames(Vec.new) <- paste0("PC", 1:9)
round(Vec.new, 2)

# Sample PC scores: one matrix product replaces the element-wise double loop.
W.new <- Z %*% Vec.new
colnames(W.new) <- paste0("PC", 1:9)

plot(data.frame(W.new))
round(cor(W.new), 3)  # PCs are uncorrelated by construction
# ---- How many components should we keep? ----
# Screeplot: the elbow suggests keeping the first 4 or 5 PCs.
plot(Val.new, type = "b", pch = 19, xlab = "", ylab = "Variances")

# Proportion of total variation explained by each PC
round(Val.new / sum(Val.new), 3)
# ---- Regression of Ladder on the standardized PC scores ----
# Full model: all nine components as explanatory variables.
PC.model.new.1 <- lm(dataframe$Ladder ~ W.new[, 1] + W.new[, 2] + W.new[, 3] +
                       W.new[, 4] + W.new[, 5] + W.new[, 6] + W.new[, 7] +
                       W.new[, 8] + W.new[, 9])
summary(PC.model.new.1)  # W4, W5, W7, and W9 appear non-significant

# Reduced model with the non-significant components removed.
PC.model.new.2 <- lm(dataframe$Ladder ~ W.new[, 1] + W.new[, 2] + W.new[, 3] +
                       W.new[, 6] + W.new[, 8])
summary(PC.model.new.2)
vif(PC.model.new.2)  # collinearity check (car); PCs should yield VIFs near 1
# Importance of each original variable within every standardized PC
# (loop replaces nine copy-pasted round() calls; see reference in discord).
for (pc in 1:9) {
  print(round(Vec.new[, pc], 3))
}
# Correlations between the PCs (W_i) and the original variables (X_j),
# which also indicate variable importance.
#
# For the eigen-decomposition mat = V Lambda V', the correlation is
#   cor(W_i, X_j) = v_ji * sqrt(lambda_i) / sqrt(mat[j, j]).
#
# @param mat  A square (co)variance or correlation matrix of the variables.
# @return     A matrix of the same size: rows X1..Xp (variables),
#             columns W1..Wp (components).
#
# FIX: eigen(mat) was recomputed inside the loop on every iteration;
# it is now computed once and the loop replaced by a single sweep().
cor.WX <- function(mat) {
  eig <- eigen(mat)
  # Scale column i of the eigenvector matrix by sqrt(lambda_i), then divide
  # each row j by the variable's standard deviation sqrt(mat[j, j]).
  vals <- sweep(eig$vectors, 2, sqrt(eig$values), `*`) / sqrt(diag(mat))
  rownames(vals) <- paste0("X", seq_len(nrow(mat)))
  colnames(vals) <- paste0("W", seq_len(nrow(mat)))
  vals
}
cor.WX(R)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement