Guest User

Untitled

a guest
Oct 16th, 2018
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.68 KB | None | 0 0
  1. setwd("~/Desktop/Second Year/CS112/Assignments/RegressionandBootstrapping")
  2. # Question 1:
  3.  
  4. # Data generation
  5. # The relationship between price of one kg of Uji matcha and the average year earning in the town of Uji
  6. set.seed(123)
  7. uji_price <- runif(99, min = 10, max = 100)
  8. earnings = c()
  9. for (i in 1:length(uji_price)) {
  10. earnings[i] = uji_price[i] * 100 + rnorm(1, mean = 0, sd = 750)
  11. }
  12.  
  13. dataset <- data.frame(uji_price, earnings)
  14.  
  15. fit <- glm(earnings ~ uji_price, data = dataset)
  16. summary(fit)
  17. plot(dataset, pch = 20)
  18. abline(fit$coefficients, col = 'red')
  19. # adding an outlier
  20. dataset <- rbind(dataset, c(1600, 1))
  21. plot(dataset, col = "skyblue", pch = 20)
  22.  
  23. # highlight the outlier
  24. points(dataset$uji_price[100], dataset$earnings[100], col = "red", pch = 20)
  25.  
  26. outlier_fit <- glm(earnings~ uji_price, data = dataset)
  27. summary(outlier_fit)
  28. abline(outlier_fit$coefficients, col = 'red')
  29. abline(fit$coefficients, col = 'blue')
  30.  
  31. # need explanation for the danger of extrapolation
  32.  
  33.  
  34. # Question 2
  35. library(Matching)
  36. data("lalonde")
  37. # only use the control group
  38. which(lalonde$treat==0)
  39. control <- lalonde[which(lalonde$treat==0),]
  40. # age, educ, re74, re75, educ*re74, educ*re75, age*re74, age*re75, and re74*re75 to predict re78
  41. linear <- lm(re78~age+educ+re74+re75+educ*re74+educ*re75+age*re74+age*re75+ re74*re75,
  42. data = control)
  43. summary(linear)
  44. library(arm)
  45. simulations <- sim(linear, n.sims = 10000)
  46. # make predictions intervals for every unit (different ages)
  47. # what is the range:
  48. range(control$age) #we need to do predictions for ages from 17 to 55
  49.  
  50. storage <- matrix(NA, nrow = 10000, ncol = 39)
  51. # 10,000 simulations with 39 ages (from 17 to 55)
  52. meduc <- median(control$educ)
  53. mre74 <- median(control$re74)
  54. mre75 <- median(control$re75)
  55. mean_predict = c()
  56. for (age in c(17:55)) {
  57. # first set the unit vector with different age but median of other variables
  58. unit = c(1, age, meduc, mre74, mre75, meduc*mre74, meduc*mre75, age*mre74, age*mre75, mre74*mre75)
  59. for (i in 1:10000) {
  60. # make sure to include irreducible error ;)
  61. prediction = sum(unit * simulations@coef[i,]) + rnorm(1, 0, simulations@sigma[i])
  62. storage[i, age-16] = prediction
  63. }
  64. mean_predict[age-16] = mean(storage[,age-16])
  65.  
  66. }
  67. # from the storage, calculate the quantile prediction interval
  68. predict.interval <- apply(storage, 2, quantile, probs = c(0.025, 0.975))
  69.  
  70. # plot prediction interval
  71. plot(x = c(1:10), y = c(1:10), type = "n", xlim = c(17,55), ylim = c(-7500,15000))
  72. points(x = c(17:55), y = mean_predict, type = 'p', cex = 1, col='blue')
  73. counter = 1
  74. for(age in 17:55) {
  75. segments(x0 = age, y0 = predict.interval[counter], x1 = age, y1 = predict.interval[counter + 1]);
  76. counter = counter + 2
  77. }
  78.  
  79. # keep every variable constant at the 90th percentile
  80. storage <- matrix(NA, nrow = 10000, ncol = 39)
  81. # 10,000 simulations with 39 ages (from 17 to 55)
  82. meduc <- quantile(control$educ, probs = c(0.9))
  83. mre74 <- quantile(control$re74, probs = c(0.9))
  84. mre75 <- quantile(control$re75, probs = c(0.9))
  85. for (age in c(17:55)) {
  86. # first set the unit vector with different age but median of other variables
  87. unit = c(1, age, meduc, mre74, mre75, meduc*mre74, meduc*mre75, age*mre74, age*mre75, mre74*mre75)
  88. for (i in 1:10000) {
  89. # make sure to include irreducible error ;)
  90. prediction = sum(unit * simulations@coef[i,])+ rnorm(1, 0, simulations@sigma[i])
  91. storage[i, age-16] = prediction
  92. }
  93. mean_predict[age-16] = mean(storage[,age-16])
  94. }
  95. # from the storage, calculate the quantile prediction interval
  96. predict.interval <- apply(storage, 2, quantile, probs = c(0.025, 0.975))
  97.  
  98. # plot prediction interval
  99. plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,55), ylim = c(-7000,20000))
  100. points(x = c(17:55), y = mean_predict, type = 'p', cex = 1, col='blue')
  101.  
  102. counter = 1
  103. for(age in 17:55) {
  104. segments(x0 = age, y0 = predict.interval[counter], x1 = age, y1 = predict.interval[counter + 1]);
  105. counter = counter + 2
  106. }
  107.  
  108.  
  109.  
  110. # Question 3
  111. library(foreign)
  112. nsw = read.dta('nsw.dta')
  113. fit <- lm(re78~treat, data = nsw)
  114. summary(fit)
  115. # analytical confidence interval for the coefficient value
  116. analytical <- confint(fit, level = 0.95)
  117. # bootstrap
  118. library(boot)
  119. mean.func = function(data, index) {
  120. d = data
  121. X = d$treat[index]
  122. Y = d$re78[index]
  123. regressor <- lm(Y~X, data = d)
  124. return(c(regressor$coefficients[1], regressor$coefficients[2]))
  125. }
  126. bootstrapped = boot(nsw, mean.func, R = 1000)
  127. # for the intercept
  128. intercept_ci = boot.ci(bootstrapped, index = 1)$normal
  129. # for the slope
  130. slope_ci = boot.ci(bootstrapped, index = 2)$normal
  131.  
  132. # table of the two variables
  133. table = rbind("Intercept (analytical)" = analytical[1,], "Intercept (bootstraped)" = intercept_ci[, 2:3],
  134. "Treat (analytical)" = analytical[2,], "Treat (bootstraped)" = slope_ci[, 2:3])
  135. table
  136.  
  137. # histogram for the bootstrap-sample result
  138. # For intercept
  139. hist(bootstrapped$t[,1], col='green', ylim=c(0,300), main="Histogram of the Bootstrapped Values",
  140. xlab="Coeffcients (Intercept)")
  141. par(new = T)
  142. coefficients = summary(fit)$coefficients
  143. # illustrating the distribution of the coefficients with the bootstrap
  144. x = seq(coefficients[1] - 4*coefficients[3], coefficients[1] + 4*coefficients[3], length=5000)
  145. y = dnorm(x, mean =coefficients[1], sd=coefficients[3])
  146. plot(x, y,type="l", lwd=3, axes=F, ylab=NA, xlab = NA)
  147. axis(side = 4, line = -1, at=seq(0,0.0016, by=0.0002))
  148. mtext(side = 4, line = 1 , 'Probability')
  149.  
  150.  
  151. # For the variable treat
  152. hist(bootstrapped$t[, 2],col='skyblue', ylim=c(0,200), main="Histogram of the Bootstrapped Values",
  153. xlab="Coeffcients (Slope)")
  154. par(new = T)
  155. coefficients = summary(fit)$coefficients
  156. x = seq(coefficients[2] - 4*coefficients[4], coefficients[2] + 4*coefficients[4], length=5000)
  157. y = dnorm(x, mean =coefficients[2], sd=coefficients[4])
  158. plot(x, y,type="l", lwd=3, axes=F, xlab=NA, ylab=NA)
  159. axis(side = 4, line = -1, at=seq(0,0.0016, by=0.0002))
  160. mtext(side = 4, line = 1 , 'Probability')
  161.  
  162.  
  163. # Question 4
  164. # Function that takes Ys and predicted Ys as inputs and output R^2
  165. # formula: (sum of square of total - sum of squares of residuals) / sum of square of total
  166.  
  167. r_squared = function (y, y_pred) {
  168. means = mean(y)
  169. ss_total = 0
  170. ss_r = 0
  171. for (i in 1:length(y)){
  172. ss_total = ss_total + (y[i] - means)^2
  173. ss_r = ss_r + (y[i] - y_pred[i])^2
  174. }
  175.  
  176. return((ss_total - ss_r)/ss_total)
  177. }
  178.  
  179. # import the data
  180. nsw = read.dta('nsw.dta')
  181. nsw = nsw[-1]
  182. # create a linear model in order to find the r squared and use to predict y values
  183. fit <- lm(re78~treat, data = nsw)
  184. y_pred = predict(fit)
  185. y = nsw$re78
  186. # plug in both the actual outcome and predicted outcome
  187. r_squared(y, y_pred)
  188. # compare the result with the r^2 computed by the lm() method
  189. summary(fit)
  190.  
  191.  
  192. # Question 5
  193. # Logistic regression
  194. logistic = glm(treat ~. - re78, data = nsw, family=binomial)
  195.  
  196. # Predicting the propensity score of each observation
  197. prob = predict(logistic, type = 'response')
  198. control = subset(nsw, treat==0) # A subset containing only units assigned to the control group
  199. treat = subset(nsw, treat == 1) # A subset containing only units assigned to the treatment group
  200. con_prob = predict(logistic, newdata = control, type = 'response') # propensity score of control group units
  201. treat_prob = predict(logistic, newdata = treat, type = 'response') # propensity score of treatment group units
  202. hist(con_prob, col = 'blue')
  203. legend("topright", "Treatment Group Probability", col = c("blue"), lwd=20, cex = 0.7)
  204. hist(treat_prob, col = 'purple')
  205. legend("topright", "Treatment Group Probability", col = c("purple"), lwd=20, cex = 0.7)
  206.  
  207. # nicer histogram with overlapping and legends
  208. con_probdf = data.frame(con_prob)
  209. treat_probdf = data.frame(treat_prob)
  210. library(ggplot2)
  211. plot <- ggplot(con_probdf, aes(x=con_prob)) +
  212. geom_histogram(color="black", fill="skyblue", bins = 20)+
  213. geom_vline(aes(xintercept=mean(con_prob)), color="blue", linetype="dashed", size=1)
  214.  
  215. plot
  216. plot2 <- ggplot(treat_probdf, aes(x=treat_prob)) +
  217. geom_histogram(color="black", fill="purple", bins = 20)+
  218. geom_vline(aes(xintercept=mean(treat_prob)), color="red", linetype="dashed", size=1)
  219.  
  220. plot2
  221.  
  222. library(plyr)
  223.  
  224. combined_df = data.frame('treat' = nsw$treat, "probability" = prob)
  225. combined_df$treat = as.factor(combined_df$treat)
  226. mu <- ddply(combined_df, "treat", summarise, grp.mean=mean(probability))
  227.  
  228.  
  229. combined <- ggplot(combined_df, aes(x = probability, fill = treat)) +
  230. geom_histogram(alpha=0.5, position="identity", binwidth = 0.005)+
  231. geom_vline(data=mu, aes(xintercept=grp.mean, color=treat),
  232. linetype="dashed")+
  233. theme(legend.position="top")
  234. combined
  235.  
  236. # Density plot
  237. density <- ggplot(combined_df, aes(x = probability, fill = treat)) +
  238. geom_histogram(aes(y=..density..), alpha=0.5, position="identity", binwidth = 0.005)+
  239. geom_vline(data=mu, aes(xintercept=grp.mean, color=treat),
  240. linetype="dashed")+
  241. theme(legend.position="top") +
  242. geom_density(alpha=0.3)
  243. density
Add Comment
Please, Sign In to add comment