Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Working directory for the assignment's data files.
# NOTE(review): setwd() in scripts is fragile -- prefer an RStudio project or
# relative paths; kept here so the read.dta() calls below still resolve.
setwd("~/Desktop/Second Year/CS112/Assignments/RegressionandBootstrapping")

# Question 1 ---------------------------------------------------------------
# Data generation: the relationship between the price of one kg of Uji matcha
# and the average yearly earnings in the town of Uji.
set.seed(123)
uji_price <- runif(99, min = 10, max = 100)
# True model: earnings = 100 * price + N(0, 750) noise.  rnorm() is
# vectorized, so one call replaces the original element-by-element loop and
# draws exactly the same values from the RNG stream.
earnings <- uji_price * 100 + rnorm(length(uji_price), mean = 0, sd = 750)
dataset <- data.frame(uji_price, earnings)

# A gaussian GLM with the identity link is just OLS; lm() is the idiomatic
# choice (and its summary also reports R^2).
fit <- lm(earnings ~ uji_price, data = dataset)
summary(fit)
plot(dataset, pch = 20)
abline(fit, col = "red")

# Add a single extreme outlier: very high price, near-zero earnings.
dataset <- rbind(dataset, c(1600, 1))
plot(dataset, col = "skyblue", pch = 20)
# Highlight the outlier (row 100).
points(dataset$uji_price[100], dataset$earnings[100], col = "red", pch = 20)
outlier_fit <- lm(earnings ~ uji_price, data = dataset)
summary(outlier_fit)
abline(outlier_fit, col = "red")  # fit dragged toward the outlier
abline(fit, col = "blue")         # original fit, for comparison
# need explanation for the danger of extrapolation
# Question 2 ---------------------------------------------------------------
library(Matching)
data("lalonde")

# Use only the control group (treat == 0).
control <- lalonde[which(lalonde$treat == 0), ]

# Regress re78 on age, educ, re74, re75 plus the pairwise interactions
# educ:re74, educ:re75, age:re74, age:re75 and re74:re75.
linear <- lm(re78 ~ age + educ + re74 + re75 + educ * re74 + educ * re75 +
               age * re74 + age * re75 + re74 * re75,
             data = control)
summary(linear)

library(arm)
simulations <- sim(linear, n.sims = 10000)

# Prediction intervals for every unit age in the observed range.
range(control$age)  # ages run from 17 to 55 -> 39 distinct ages
ages <- 17:55
# 10,000 simulated predictions for each of the 39 ages.
storage <- matrix(NA, nrow = 10000, ncol = length(ages))

# Hold the other covariates at their control-group medians.
meduc <- median(control$educ)
mre74 <- median(control$re74)
mre75 <- median(control$re75)

for (age in ages) {
  # Predictor vector ordered to match the model's coefficients:
  # intercept, age, educ, re74, re75, educ:re74, educ:re75,
  # age:re74, age:re75, re74:re75.
  unit <- c(1, age, meduc, mre74, mre75, meduc * mre74, meduc * mre75,
            age * mre74, age * mre75, mre74 * mre75)
  # Vectorized over all 10,000 coefficient draws: linear predictor plus
  # irreducible error using each draw's own sigma.  This replaces the
  # original scalar inner loop and consumes the RNG stream identically.
  storage[, age - 16] <- simulations@coef %*% unit +
    rnorm(10000, mean = 0, sd = simulations@sigma)
}
mean_predict <- colMeans(storage)

# 2.5% / 97.5% quantile prediction interval per age (2 x 39 matrix).
predict.interval <- apply(storage, 2, quantile, probs = c(0.025, 0.975))

# Plot the mean prediction (blue) with a vertical interval segment per age.
plot(x = c(1:10), y = c(1:10), type = "n", xlim = c(17, 55),
     ylim = c(-7500, 15000))
points(x = ages, y = mean_predict, type = "p", cex = 1, col = "blue")
# Column j of predict.interval holds the bounds for age j + 16; segments()
# is vectorized, so no counter loop is needed.
segments(x0 = ages, y0 = predict.interval[1, ],
         x1 = ages, y1 = predict.interval[2, ])
# Repeat the exercise holding every other covariate at its 90th percentile.
storage <- matrix(NA, nrow = 10000, ncol = 39)
# 10,000 simulations for each of the 39 ages (17 through 55).
meduc <- quantile(control$educ, probs = c(0.9))
mre74 <- quantile(control$re74, probs = c(0.9))
mre75 <- quantile(control$re75, probs = c(0.9))
for (age in 17:55) {
  # Predictor vector ordered to match the coefficients of `linear`.
  unit <- c(1, age, meduc, mre74, mre75, meduc * mre74, meduc * mre75,
            age * mre74, age * mre75, mre74 * mre75)
  # Vectorized draw over all 10,000 simulations: linear predictor plus
  # irreducible error with each simulation's own sigma (replaces the
  # original scalar inner loop; same values from the RNG stream).
  storage[, age - 16] <- simulations@coef %*% unit +
    rnorm(10000, mean = 0, sd = simulations@sigma)
  mean_predict[age - 16] <- mean(storage[, age - 16])
}
# 2.5% / 97.5% quantile prediction interval per age (2 x 39 matrix).
predict.interval <- apply(storage, 2, quantile, probs = c(0.025, 0.975))
# Plot the mean prediction (blue) with a vertical interval segment per age.
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17, 55),
     ylim = c(-7000, 20000))
points(x = c(17:55), y = mean_predict, type = "p", cex = 1, col = "blue")
# Column j of predict.interval holds the bounds for age j + 16.
segments(x0 = 17:55, y0 = predict.interval[1, ],
         x1 = 17:55, y1 = predict.interval[2, ])
# Question 3 ---------------------------------------------------------------
library(foreign)
nsw <- read.dta('nsw.dta')
fit <- lm(re78 ~ treat, data = nsw)
summary(fit)

# Analytical 95% confidence intervals for the two coefficients.
analytical <- confint(fit, level = 0.95)

# Bootstrap the same coefficients.
library(boot)
# Statistic for boot(): refit the regression on the resampled rows selected
# by `index` and return both coefficients (intercept, slope).  (The original
# passed `data = d` while regressing closure variables Y ~ X, which silently
# ignored `d`; subsetting the data frame directly is equivalent and clear.)
coef.func <- function(data, index) {
  resampled <- data[index, ]
  regressor <- lm(re78 ~ treat, data = resampled)
  c(regressor$coefficients[1], regressor$coefficients[2])
}
bootstrapped <- boot(nsw, coef.func, R = 1000)
# Normal-approximation bootstrap CIs; columns 2:3 of $normal are the bounds.
intercept_ci <- boot.ci(bootstrapped, index = 1)$normal  # for the intercept
slope_ci <- boot.ci(bootstrapped, index = 2)$normal      # for the slope
# Side-by-side table of analytical vs bootstrapped intervals.
# (Renamed from `table`, which shadowed base::table.)
ci_table <- rbind(
  "Intercept (analytical)"    = analytical[1, ],
  "Intercept (bootstrapped)"  = intercept_ci[, 2:3],
  "Treat (analytical)"        = analytical[2, ],
  "Treat (bootstrapped)"      = slope_ci[, 2:3]
)
ci_table
# Histograms of the bootstrap-sample coefficients, each overlaid with the
# normal sampling distribution implied by the analytical fit.
# For the intercept:
hist(bootstrapped$t[, 1], col = "green", ylim = c(0, 300),
     main = "Histogram of the Bootstrapped Values",
     xlab = "Coefficients (Intercept)")
par(new = TRUE)  # overlay the density curve on the same device
# summary()$coefficients rows are terms, columns are Estimate, Std. Error,
# t value, Pr(>|t|).  Explicit [row, col] indexing replaces the original
# linear indices [1]/[3] (which relied on column-major layout).
coef_tab <- summary(fit)$coefficients
x <- seq(coef_tab[1, 1] - 4 * coef_tab[1, 2],
         coef_tab[1, 1] + 4 * coef_tab[1, 2], length = 5000)
y <- dnorm(x, mean = coef_tab[1, 1], sd = coef_tab[1, 2])
plot(x, y, type = "l", lwd = 3, axes = FALSE, ylab = NA, xlab = NA)
axis(side = 4, line = -1, at = seq(0, 0.0016, by = 0.0002))
mtext(side = 4, line = 1, "Probability")

# For the variable treat (the slope):
hist(bootstrapped$t[, 2], col = "skyblue", ylim = c(0, 200),
     main = "Histogram of the Bootstrapped Values",
     xlab = "Coefficients (Slope)")
par(new = TRUE)
x <- seq(coef_tab[2, 1] - 4 * coef_tab[2, 2],
         coef_tab[2, 1] + 4 * coef_tab[2, 2], length = 5000)
y <- dnorm(x, mean = coef_tab[2, 1], sd = coef_tab[2, 2])
plot(x, y, type = "l", lwd = 3, axes = FALSE, xlab = NA, ylab = NA)
axis(side = 4, line = -1, at = seq(0, 0.0016, by = 0.0002))
mtext(side = 4, line = 1, "Probability")
# Question 4 ---------------------------------------------------------------
# Coefficient of determination (R^2).
#
# Takes observed outcomes `y` and predicted outcomes `y_pred` (numeric
# vectors of equal length) and returns
#   (SS_total - SS_residual) / SS_total  ==  1 - SS_res / SS_tot.
# The sums of squares are computed with vectorized arithmetic instead of the
# original element-by-element accumulation loop.
r_squared <- function(y, y_pred) {
  stopifnot(length(y) == length(y_pred))
  ss_total <- sum((y - mean(y))^2)
  ss_r <- sum((y - y_pred)^2)
  (ss_total - ss_r) / ss_total
}
# Import the data and drop the first column (presumably an id column --
# NOTE(review): confirm against the nsw.dta codebook).
nsw <- read.dta('nsw.dta')
nsw <- nsw[-1]
# Fit the simple treatment-effect regression and produce fitted values.
fit <- lm(re78 ~ treat, data = nsw)
y_pred <- predict(fit)
y <- nsw$re78
# Hand-rolled R^2 on actual vs predicted outcomes...
r_squared(y, y_pred)
# ...which should match the Multiple R-squared reported by summary(lm()).
summary(fit)
# Question 5 ---------------------------------------------------------------
# Propensity scores via logistic regression of treatment assignment on all
# covariates except the outcome re78.
logistic <- glm(treat ~ . - re78, data = nsw, family = binomial)
# Predicted propensity score for every observation.
prob <- predict(logistic, type = 'response')
control <- subset(nsw, treat == 0)  # units assigned to the control group
treat <- subset(nsw, treat == 1)    # units assigned to the treatment group
con_prob <- predict(logistic, newdata = control, type = 'response')    # control-group scores
treat_prob <- predict(logistic, newdata = treat, type = 'response')    # treatment-group scores
hist(con_prob, col = 'blue')
# Fix: this panel shows the *control* group; the original legend wrongly
# said "Treatment Group Probability".
legend("topright", "Control Group Probability", col = c("blue"), lwd = 20,
       cex = 0.7)
hist(treat_prob, col = 'purple')
legend("topright", "Treatment Group Probability", col = c("purple"), lwd = 20,
       cex = 0.7)
# Nicer ggplot2 histograms of the propensity scores, one group per figure,
# each with a dashed vertical line at the group mean.
library(ggplot2)
con_probdf <- data.frame(con_prob)
treat_probdf <- data.frame(treat_prob)
# Control group: sky-blue bars, blue dashed mean line.
plot <- ggplot(con_probdf, aes(x = con_prob)) +
  geom_histogram(color = "black", fill = "skyblue", bins = 20) +
  geom_vline(aes(xintercept = mean(con_prob)), color = "blue",
             linetype = "dashed", size = 1)
plot
# Treatment group: purple bars, red dashed mean line.
plot2 <- ggplot(treat_probdf, aes(x = treat_prob)) +
  geom_histogram(color = "black", fill = "purple", bins = 20) +
  geom_vline(aes(xintercept = mean(treat_prob)), color = "red",
             linetype = "dashed", size = 1)
plot2
library(plyr)
# One data frame holding every unit's treatment indicator and propensity
# score; treat becomes a factor so ggplot maps it to fill/color groups.
combined_df = data.frame('treat' = nsw$treat, "probability" = prob)
combined_df$treat = as.factor(combined_df$treat)
# Per-group mean propensity score (one row per treatment level),
# used to place the dashed vertical lines below.
mu <- ddply(combined_df, "treat", summarise, grp.mean=mean(probability))
# Overlaid (position = "identity") histograms of the two groups' scores,
# with a dashed line at each group mean and the legend on top.
combined <- ggplot(combined_df, aes(x = probability, fill = treat)) +
  geom_histogram(alpha=0.5, position="identity", binwidth = 0.005)+
  geom_vline(data=mu, aes(xintercept=grp.mean, color=treat),
             linetype="dashed")+
  theme(legend.position="top")
combined
# Density plot
# Same overlay on the density scale (y = ..density..; newer ggplot2 spells
# this after_stat(density)), plus smoothed density curves on top.
density <- ggplot(combined_df, aes(x = probability, fill = treat)) +
  geom_histogram(aes(y=..density..), alpha=0.5, position="identity", binwidth = 0.005)+
  geom_vline(data=mu, aes(xintercept=grp.mean, color=treat),
             linetype="dashed")+
  theme(legend.position="top") +
  geom_density(alpha=0.3)
density
Add Comment
Please, Sign In to add comment