dataframe = read.csv(file = "C:/Users/micha/Downloads/happiness2017.csv", header = TRUE)
master_dataframe <- dataframe
set.seed(0897)
dataframe = dataframe[complete.cases(dataframe), ]      # remove countries with missing values
dataframe = as.matrix(dataframe[2:11])                  # drop the Country column, keep the 10 numeric variables
dataframe <- dataframe[sample(nrow(dataframe), 100), ]  # take a random sample of 100 countries
# Multivariate scatterplot
library(car)    # scatterplotMatrix(), vif()
library(Rfast)  # colVars()
scatterplotMatrix(dataframe, main = "Matrix ScatterPlot | Eric Lee, Michael Du")

# Summary statistics
colMeans(dataframe) # sample means
colVars(dataframe)  # sample variances
# Other summary statistics may be worth reporting; one possibility is sketched below.
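# A possible addition (sketch only, not prescribed by the assignment): the
# sample correlation matrix and the column medians as further summaries.
round(cor(dataframe), 3)      # pairwise correlations between all 10 variables
apply(dataframe, 2, median)   # column medians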

# Calculating outliers: standardize each value and flag those more than
# 3.5 standard deviations above the column mean
ncol(dataframe)
S = cov(dataframe)
for (j in 1:ncol(dataframe)) {
  xbar = mean(dataframe[, j])
  for (i in 1:nrow(dataframe)) {
    current_value = dataframe[i, j]
    standardized_value = (current_value - xbar) / sqrt(S[j, j])
    str <- paste("(", current_value, "-", xbar, ")", "/", "sqrt(", S[j, j], ")")
    # print(str)
    # print(standardized_value)
    if (standardized_value > 3.5) {
      outlier_string <- paste("[", i, ",", j, "]", "is an outlier! value =", standardized_value)
      print(outlier_string)
    }
  }
}

# No outliers were flagged; the closest standardized value is 3.427 < 3.5 at [49, 6]
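# Cross-check (a minimal sketch, not part of the original analysis): scale()
# does the same column-wise standardization in one call. Note this version is
# two-sided (|z| > 3.5), whereas the loop above only flags large positive values.
z_scores <- scale(dataframe)                 # center and scale every column
which(abs(z_scores) > 3.5, arr.ind = TRUE)   # rows/columns of any flagged values
max(abs(z_scores))                           # largest standardized magnitude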
# Next, a formal check of multivariate normality: compare the ordered squared
# generalized distances d_i^2 = (x_i - xbar)' S^{-1} (x_i - xbar) against chi-square quantiles
S_inv = solve(S)
d_vector = c()
for (i in 1:nrow(dataframe)) {
  xi = dataframe[i, ]
  for (j in 1:ncol(dataframe)) {
    xbar = mean(dataframe[, j])
    xi[j] = xi[j] - xbar
  }
  d_value = t(xi) %*% S_inv %*% xi
  print(d_value)
  d_vector <- c(d_vector, d_value)
}
d_vector <- sort(d_vector)

# Chi-square quantiles at probabilities (i - 1/2)/n, with df equal to the number of variables
chisq_vector = c()
for (i in 1:nrow(dataframe)) {
  chisq_value = qchisq(p = (i - 1/2) / nrow(dataframe), df = ncol(dataframe))
  chisq_vector <- c(chisq_vector, chisq_value)
}
plot(d_vector, chisq_vector, xlab = expression("d"[i]^2), ylab = "Chi-squared Quantiles",
     main = "Checking Joint Normality | Eric Lee, Michael Du")
abline(a = 0, b = 1, col = "red")
# The points lie close to the 45-degree line, so joint normality looks reasonable
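# Cross-check (sketch): R's built-in mahalanobis() computes the same squared
# distances in one call; it should reproduce d_vector up to numerical precision.
d_check <- mahalanobis(dataframe, center = colMeans(dataframe), cov = S)
all.equal(sort(unname(d_check)), d_vector)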

# Linear model
dataframe <- as.data.frame(dataframe)
lin_model <- lm(Ladder ~ LogGDP + Social + HLE + Freedom + Generosity + Corruption + Positive + Negative + gini,
                data = dataframe)
summary(lin_model)
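# The explanatory variables are likely correlated with one another, which is
# the usual motivation for the principal-component regression below. A quick
# check (sketch only; the cutoff of 10 is a common rule of thumb, not from the original):
vif(lin_model)   # variance inflation factors; values above ~10 suggest strong multicollinearity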

# PC regression model
data <- dataframe[, 2:10]          # the nine explanatory variables
data.bar = apply(data, 2, mean)    # column means
data.sd  = apply(data, 2, sd)      # column standard deviations
Z = data
for (i in 1:ncol(data)) {          # standardize every column of data
  Z[, i] = (data[, i] - data.bar[i]) / data.sd[i]
}
R = cor(data)
Val.new = eigen(R)$values
round(Val.new, 2)

Vec.new = eigen(R)$vectors
rownames(Vec.new) = colnames(data)
colnames(Vec.new) = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9")
round(Vec.new, 2)

# Obtain sample PC scores: W = Z %*% Vec.new, computed entry by entry
W.new = data # container of the same size as data
colnames(W.new) = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9")
Z = as.matrix(Z)
for (i in 1:9) {       # over PCs
  for (j in 1:100) {   # over sampled rows
    W.new[j, i] = Vec.new[, i] %*% Z[j, ]
  }
}

plot(data.frame(W.new))
round(cor(W.new), 3)
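# Cross-check (sketch): prcomp() on the same data with scaling should reproduce
# the eigenvalues of R and, up to the sign of each component, the hand-computed scores.
pca <- prcomp(data, center = TRUE, scale. = TRUE)
round(pca$sdev^2, 2)                          # should match round(Val.new, 2)
round(diag(cor(pca$x, as.matrix(W.new))), 2)  # should be +1 or -1 for every PC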

# How many components should we keep?
#######################################
# Scree plot:
plot(Val.new, type = "b", pch = 19, xlab = "", ylab = "Variances") # suggests keeping the first 4 or 5 PCs

# Proportion of variation explained by each PC:
round(Val.new / sum(Val.new), 3)
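# Cumulative proportion of variation (sketch): another common aid, alongside the
# scree plot, for deciding how many components to retain.
round(cumsum(Val.new) / sum(Val.new), 3)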
  105.  
  106. # Regression with all standardized PCs as the explanatory variables
  107. ##############################################################################
  108.  
  109. PC.model.new.1 = lm(dataframe$Ladder ~ W.new[,1] + W.new[,2] + W.new[,3] + W.new[,4] + W.new[,5]+ W.new[,6]+ W.new[,7]+ W.new[,8]+ W.new[,9])
  110. summary(PC.model.new.1) # W4, W5, W7, and W9 seem not to be significant
  111.  
  112. # Let's remove these
  113.  
  114. PC.model.new.2 = lm(dataframe$Ladder ~ W.new[,1] + W.new[,2] + W.new[,3]+ W.new[,6]+ W.new[,8])
  115. summary(PC.model.new.2)
  116. vif(PC.model.new.2)
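# Sketch (not part of the original): a formal F-test that the dropped PCs
# contribute nothing beyond the retained ones, comparing the nested models.
anova(PC.model.new.2, PC.model.new.1)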

# Check the importance of each variable in the standardized PCs (the loadings):
round(Vec.new[, 1], 3)
round(Vec.new[, 2], 3)
round(Vec.new[, 3], 3)
round(Vec.new[, 4], 3)
round(Vec.new[, 5], 3)
round(Vec.new[, 6], 3)
round(Vec.new[, 7], 3)
round(Vec.new[, 8], 3)
round(Vec.new[, 9], 3)

# The correlations between PCs and variables also indicate importance:
cor.WX = function(mat) {
  vals = mat
  rownames(vals) = paste("X", 1:nrow(mat), sep = "")
  colnames(vals) = paste("W", 1:nrow(mat), sep = "")
  for (i in 1:ncol(vals)) {
    val = t(eigen(mat)$vectors[, i]) * sqrt(eigen(mat)$values[i])
    vals[, i] = val / sqrt(diag(mat))
  }
  return(vals)
}
cor.WX(R)
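# Cross-check (sketch): the same correlations can be computed directly from the
# standardized data and the sample PC scores; it should agree with cor.WX(R).
round(cor(Z, as.matrix(W.new)), 3)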