# Untitled

library(ggplot2)
library(dplyr)

set.seed(420)

# Work inside the project folder when it exists. On any other machine the
# script keeps running relative to the current working directory instead of
# aborting on a missing path.
project_dir <- "/if/fame/gcm/usr/george/math658"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning(
    "Project directory not found; using ", getwd(), " instead.",
    call. = FALSE
  )
}

# Simulated population: 300 students spread evenly over 15 schools.
n_students <- 300
n_schools <- 15

ID <- seq_len(n_students)
# School IDs cycle 1, 2, ..., 15, 1, 2, ... so consecutive student IDs belong
# to different schools. The head() calls further down rely on this ordering.
IDschool <- rep(seq_len(n_schools), times = n_students / n_schools)

# Draw one answer per student. Drawing fewer values than there are students
# makes data.frame() silently recycle them: every Q1-Q4 value would appear
# three times, and a 5-value Q5 would be constant within each school.
question1 <- rbinom(n = n_students, size = 1, prob = 0.17)
question2 <- round(rnorm(n = n_students, mean = 2.85, sd = 0.4), digits = 1)
question3 <- round(rnorm(n = n_students, mean = 3.7, sd = 0.23), digits = 1)
question4 <- round(rnorm(n = n_students, mean = 1.802, sd = 0.2), digits = 1)
# size/replace are required to draw one value per student with the given
# probabilities; without them sample() only permutes the five values.
question5 <- sample(
  c(2.4, 2.25, 2.3, 2.45, 2.5),
  size = n_students,
  replace = TRUE,
  prob = c(0.1, 0.3, 0.4, 0.1, 0.1)
)

survey <- data.frame(
  ID = ID,
  IDschool = IDschool,
  Q1 = question1,
  Q2 = question2,
  Q3 = question3,
  Q4 = question4,
  Q5 = question5
)

# Uncomment to create a new survey dataset. This overwrites survey.csv, and
# the regenerated values will differ from a file written by an earlier
# version of this simulation.
# write.csv(survey, "survey.csv", row.names = FALSE)

# Pilot survey: the first 10 listed students of school 1, used to get a
# preliminary variance for Q2.
surveydf <- read.csv(file = "survey.csv", header = TRUE)
keep <- 1
gtdf <- dplyr::filter(surveydf, IDschool %in% keep)
pilotdf <- head(gtdf, n = 10)
var(pilotdf[["Q2"]])

# Complete survey
# Sampled schools. Let's say: UMD - job market (2), Carnegie Mellon
# University - full list (3), Penn - full list (4), Johns Hopkins - full
# list (5), Temple - full list (6).
keep <- c(2, 3, 4, 5, 6)

# Story so far: 10 students were selected from each university and 2 did not
# respond (one from Johns Hopkins, one from Temple), hence 48 rows are kept.
# listtype flags UMD (school 2) as the job-market-candidate list.
sampledf <- surveydf %>%
  dplyr::filter(IDschool %in% keep) %>%
  dplyr::mutate(listtype = as.numeric(IDschool == 2)) %>%
  head(48)

# Two-stage cluster estimate of the per-student mean (ybar-hat) for Q1..Q5.

# First-stage expansion factor N/n (schools in population / schools sampled).
N <- 15
n <- 5
coef1 <- N / n

# Second-stage expansion factors M_i/m_i (school size / respondents).
# Assumes schools 2-4 returned 10 answers each and schools 5-6 returned 9.
Mi <- 20
coef2_234 <- Mi / 10
coef2_56 <- Mi / 9

# Total number of students in the population.
Mzero <- 300

# Per-school sums of Q1..Q5 (columns 3 to 7), one row per sampled school.
aggdf <- aggregate(sampledf[, 3:7], by = list(sampledf$IDschool), FUN = sum)

# Expand the sums of the first three schools and of the last two schools
# with their respective weights. The grouping column is summed along with
# the answers here and discarded at the end.
first_colsums <- (coef1 * coef2_234) * colSums(aggdf[1:3, ])
second_colsums <- (coef1 * coef2_56) * colSums(aggdf[4:5, ])

# Estimated population total divided by population size; element 1 (the
# grouping column) is dropped so only Q1..Q5 remain.
yclust <- ((first_colsums + second_colsums) / Mzero)[-1]

# Now estimate the variance.

# Average school size M0/N.
Mbar <- Mzero / N

# Per-school sample means of Q1..Q5.
unimeansdf <- aggregate(sampledf[, 3:7], list(sampledf$IDschool), mean)
names(unimeansdf) <- c(
  "IDschool", "Q1mean", "Q2mean", "Q3mean", "Q4mean", "Q5mean"
)

# Between-school terms (M_i * ybar_i - Mbar * ybar_hat)^2 / (n - 1), one row
# per school and one column per question.
# The centring value must be matched to the question (column). A plain
# `data.frame - vector` recycles the vector down each column, so with 5
# schools and 5 questions it would silently subtract the Q_i estimate from
# school i instead. sweep() over MARGIN = 2 aligns it with the columns.
mean_cols <- c("Q1mean", "Q2mean", "Q3mean", "Q4mean", "Q5mean")
school_totals <- as.matrix(Mi * unimeansdf[mean_cols])
total_devs <- sweep(school_totals, MARGIN = 2, STATS = Mbar * yclust, FUN = "-")
# NOTE(review): these are the per-school terms; s_t^2 itself is presumably
# their column sum (colSums(s_t_squared)) - confirm before using it.
s_t_squared <- as.data.frame(total_devs^2 / (n - 1))

# Attach each school's means to its respondents; presumably the starting
# point for the within-school variance s_i^2 - not computed yet.
s_i_squared <- merge(sampledf, unimeansdf, by = "IDschool")

# Population means of Q1..Q5, for comparison with the estimates.
colMeans(surveydf[, 3:7])