Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
library(ggplot2)
library(dplyr)

# Fix the RNG seed so the simulated survey below is reproducible.
set.seed(420)

# The project directory is specific to the original author's machine. Switch
# to it only when it exists; otherwise keep the current working directory so
# the script can still run next to a local copy of survey.csv.
project_dir <- "/if/fame/gcm/usr/george/math658"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message(
    "Project directory not found; using working directory ", getwd()
  )
}
# Simulate the survey population ----
# 300 students spread evenly over 15 schools (20 per school). School ids
# cycle 1, 2, ..., 15, 1, 2, ... so consecutive rows belong to different
# schools.
n_students <- 300L
n_schools <- 15L
ID <- seq_len(n_students)
IDschool <- rep(seq_len(n_schools), times = n_students / n_schools)

# Draw one independent response per student for every question. Drawing fewer
# values than there are students would make data.frame() silently recycle
# them, duplicating responses across students.
question1 <- rbinom(n = n_students, size = 1, prob = 0.17)
question2 <- round(rnorm(n = n_students, mean = 2.85, sd = 0.4), digits = 1)
question3 <- round(rnorm(n = n_students, mean = 3.7, sd = 0.23), digits = 1)
question4 <- round(rnorm(n = n_students, mean = 1.802, sd = 0.2), digits = 1)
# Weighted draw with replacement; without `size`/`replace = TRUE`, sample()
# only permutes the five values and `prob` has no effect on their frequency.
question5 <- sample(
  c(2.4, 2.25, 2.3, 2.45, 2.5),
  size = n_students,
  replace = TRUE,
  prob = c(0.1, 0.3, 0.4, 0.1, 0.1)
)

survey <- data.frame(
  ID = ID,
  IDschool = IDschool,
  Q1 = question1,
  Q2 = question2,
  Q3 = question3,
  Q4 = question4,
  Q5 = question5
)
stopifnot(nrow(survey) == n_students)

# Uncomment to write a new survey dataset. Row names are omitted so that the
# question columns stay in positions 3 to 7 when the file is read back.
# write.csv(survey, "survey.csv", row.names = FALSE)
# Pilot survey ----
# Read the stored survey population from the working directory.
surveydf <- read.csv("survey.csv", header = TRUE)

# The pilot uses school 1 only: take its first 10 students.
keep <- c(1)
gtdf <- dplyr::filter(surveydf, IDschool %in% keep)
pilotdf <- head(gtdf, 10)

# Sample variance of Q2 in the pilot. print() makes the value visible when
# the script is run through source() as well as Rscript.
print(var(pilotdf$Q2))
# Complete survey ----
# Sampled schools (IDschool):
#   2 = UMD (job market list)
#   3 = Carnegie Mellon University (full list)
#   4 = Penn (full list)
#   5 = Johns Hopkins (full list)
#   6 = Temple (full list)
keep <- c(2, 3, 4, 5, 6)
sampledf <- dplyr::filter(surveydf, IDschool %in% keep)

# Mark UMD as the job market candidate list (1) versus full lists (0).
sampledf$listtype <- as.numeric(sampledf$IDschool == 2)

# Story so far: 10 students were selected from each university and 2 did not
# respond, one from Johns Hopkins and one from Temple. Keeping the first 48
# rows produces that pattern only when the rows of survey.csv cycle through
# the schools (2, 3, 4, 5, 6, 2, 3, ...), which is assumed here and verified
# right below.
sampledf <- head(sampledf, 48)
stopifnot(
  identical(as.integer(table(sampledf$IDschool)), c(10L, 10L, 10L, 9L, 9L))
)
# Two-stage cluster estimate of the mean (ybar-hat) ----
# yclust = (N / n) * sum_i (M_i / m_i) * sum_j y_ij / Mzero, per question.

# First-stage weight N / n: 15 schools in the population, 5 sampled.
N <- 15
n <- 5
coef1 <- N / n

# Mi students per school; Mzero students in the whole population.
Mi <- 20
Mzero <- 300

# Response totals per sampled school (one row per school, sorted by id).
# Columns are selected by name so an extra leading column in the CSV cannot
# shift the questions.
question_cols <- c("Q1", "Q2", "Q3", "Q4", "Q5")
aggdf <- aggregate(sampledf[, question_cols], list(sampledf$IDschool), sum)
stopifnot(nrow(aggdf) == n)

# Second-stage weight M_i / m_i, with m_i the number of respondents actually
# present for each school, aligned to the row order of aggdf.
m_i <- as.vector(table(sampledf$IDschool)[as.character(aggdf$Group.1)])
coef2 <- Mi / m_i

# Multiplying a matrix by a vector recycles the vector down the columns, so
# row i (school i) is scaled by its own weight coef1 * coef2[i].
school_totals <- as.matrix(aggdf[, question_cols])
yclust <- colSums(coef1 * coef2 * school_totals) / Mzero
# Variance estimation ----
# Average number of students per school.
Mbar <- Mzero / N

# Mean response per sampled school and question (one row per school).
question_cols <- c("Q1", "Q2", "Q3", "Q4", "Q5")
unimeansdf <- aggregate(sampledf[, question_cols], list(sampledf$IDschool), mean)
names(unimeansdf) <- c(
  "IDschool", "Q1mean", "Q2mean", "Q3mean", "Q4mean", "Q5mean"
)
mean_cols <- c("Q1mean", "Q2mean", "Q3mean", "Q4mean", "Q5mean")

# Deviation of each school's estimated total (Mi * school mean) from
# Mbar * yclust. yclust holds one value per QUESTION, so it has to be
# subtracted column by column; a plain `data.frame - vector` would recycle it
# down the rows and subtract a different question's estimate from each school.
school_total_dev <- sweep(Mi * unimeansdf[, mean_cols], 2, Mbar * yclust)

# Per-school squared deviations scaled by (n - 1), plus their sum over the
# sampled schools for each question.
s_t_squared <- school_total_dev^2 / (n - 1)
s_t_squared_total <- colSums(s_t_squared)

# Attach each school's mean responses to the individual responses.
# NOTE(review): the name suggests a within-school variance, but only the join
# is performed here; the computation looks unfinished.
s_i_squared <- merge(sampledf, unimeansdf, by = "IDschool")

# Means over the whole stored survey, for comparison with yclust.
print(colMeans(surveydf[, question_cols]))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement