SHOW:
|
|
- or go back to the newest paste.
1 | - | ## Create a simulated dataset with known parameters, and then run a ML multilevel model, a ML SEM, and a Bayesian multilevel model; with the last, calculate Expected Value of Sample Information (EVSI): |
1 | + | ## Create a simulated dataset with known parameters, and then run a ML multilevel model, a ML SEM, |
2 | ## and a Bayesian multilevel model; with the last, calculate Expected Value of Sample Information (EVSI): | |
3 | ||
4 | ## SIMULATE | |
5 | set.seed(2015-08-11) | |
6 | ## "There is a variable X, x belongs to [0, 100]." | |
7 | toplevel <- rnorm(n=1, 50, 25) | |
8 | ## "There are n ways of measuring it, among them A and B are widely used." | |
9 | ## "For any given measurer, the difference between x(A) and x(B) can be up to 20 points." | |
10 | A <- toplevel + runif(1, min=-10, max=10) | |
11 | B <- toplevel + runif(1, min=-10, max=10) | |
12 | c(toplevel, A, B) | |
13 | # [1] 63.85938385 55.43608379 59.42333264 | |
14 | ### the true level of X we wish to recover is '63.85' | |
15 | ||
16 | ## "Between two any measurers, x(A)1 and x(A)2 can differ on average 10 points, likewise with B." | |
17 | ### let's imagine 10 hypothetical points are sample using method A and method B | |
18 | ### assume 'differ on average 10 points' here means something like 'the standard deviation is 10' | |
19 | A_1 <- rnorm(n=10, mean=A, sd=10) | |
20 | B_1 <- rnorm(n=10, mean=B, sd=10) | |
21 | ||
22 | data <- rbind(data.frame(Measurement="A", Y=A_1), data.frame(Measurement="B", Y=B_1)); data | |
23 | # Measurement Y | |
24 | # 1 A 56.33870025 | |
25 | # 2 A 69.07267213 | |
26 | # 3 A 40.36889573 | |
27 | # 4 A 48.67289213 | |
28 | # 5 A 79.92622603 | |
29 | # 6 A 62.86919410 | |
30 | # 7 A 53.12953462 | |
31 | # 8 A 66.58894990 | |
32 | # 9 A 47.86296948 | |
33 | # 10 A 60.72416003 | |
34 | # 11 B 68.60203507 | |
35 | # 12 B 58.24702007 | |
36 | # 13 B 45.47895879 | |
37 | # 14 B 63.45308935 | |
38 | # 15 B 52.27724328 | |
39 | # 16 B 56.89783535 | |
40 | # 17 B 55.93598486 | |
41 | # 18 B 59.28162022 | |
42 | # 19 B 70.92341777 | |
43 | # 20 B 49.51360373 | |
44 | ||
45 | ## MLM | |
46 | ||
47 | ## multi-level model approach: | |
48 | library(lme4) | |
49 | mlm <- lmer(Y ~ (1|Measurement), data=data); summary(mlm) | |
50 | # Random effects: | |
51 | # Groups Name Variance Std.Dev. | |
52 | # Measurement (Intercept) 0.0000 0.000000 | |
53 | # Residual 95.3333 9.763877 | |
54 | # Number of obs: 20, groups: Measurement, 2 | |
55 | # | |
56 | # Fixed effects: | |
57 | # Estimate Std. Error t value | |
58 | # (Intercept) 58.308250 2.183269 26.70685 | |
59 | confint(mlm) | |
60 | # 2.5 % 97.5 % | |
61 | # .sig01 0.000000000 7.446867736 | |
62 | # .sigma 7.185811525 13.444112087 | |
63 | # (Intercept) 53.402531768 63.213970887 | |
64 | ||
65 | ## So we estimate X at 58.3 but it's not inside our confidence interval with such little data. Bad luck? | |
66 | ||
67 | ## SEM | |
68 | ||
69 | library(lavaan) | |
70 | X.model <- ' X =~ B + A | |
71 | A =~ a | |
72 | B =~ b' | |
73 | X.fit <- sem(model = X.model, meanstructure = TRUE, data = data2) | |
74 | summary(X.fit) | |
75 | # ... Estimate Std.err Z-value P(>|z|) | |
76 | # Latent variables: | |
77 | # X =~ | |
78 | # B 1.000 | |
79 | # A 7619.504 | |
80 | # A =~ | |
81 | # a 1.000 | |
82 | # B =~ | |
83 | # b 1.000 | |
84 | # | |
85 | # Intercepts: | |
86 | # a 58.555 | |
87 | # b 58.061 | |
88 | # X 0.000 | |
89 | # A 0.000 | |
90 | # B 0.000 | |
91 | ## Well, that didn't work well - explodes, unfortunately. Probably still not enough data. | |
92 | ||
93 | ## MLM (Bayesian) | |
94 | ||
95 | library(R2jags) | |
96 | ## rough attempt at writing down an explicit multilevel model which | |
97 | ## respects the mentioned priors about errors being reasonably small: | |
98 | model <- function() { | |
99 | grand.mean ~ dunif(0,100) | |
100 | ||
101 | delta.between.group ~ dunif(0, 10) | |
102 | ||
103 | sigma.between.group ~ dunif(0, 100) | |
104 | tau.between.group <- pow(sigma.between.group, -2) | |
105 | ||
106 | for(j in 1:K){ | |
107 | # let's say the group-level differences are also normally-distributed: | |
108 | group.delta[j] ~ dnorm(delta.between.group, tau.between.group) | |
109 | # and each group also has its own standard-deviation, potentially different from the others': | |
110 | group.within.sigma[j] ~ dunif(0, 20) | |
111 | group.within.tau[j] <- pow(group.within.sigma[j], -2) | |
112 | ||
113 | # save the net combo for convenience & interpretability: | |
114 | group.mean[j] <- grand.mean + group.delta[j] | |
115 | } | |
116 | ||
117 | for (i in 1:N) { | |
118 | # each individual observation is from the grand-mean + group-offset, then normally distributed: | |
119 | Y[i] ~ dnorm(grand.mean + group.delta[Group[i]], group.within.tau[Group[i]]) | |
120 | } | |
121 | ||
122 | } | |
123 | jagsData <- list(N=nrow(data), Y=data$Y, K=length(levels(data$Measurement)), | |
124 | Group=data$Measurement) | |
125 | params <- c("grand.mean","delta.between.group", "sigma.between.group", "group.delta", "group.mean", | |
126 | "group.within.sigma") | |
127 | k1 <- jags(data=jagsData, parameters.to.save=params, inits=NULL, model.file=model); k1 | |
128 | # ... mu.vect sd.vect 2.5% 25% 50% 75% 97.5% Rhat n.eff | |
129 | # delta.between.group 4.971 2.945 0.221 2.353 4.967 7.594 9.791 1.008 260 | |
130 | # grand.mean 52.477 11.321 23.453 47.914 53.280 58.246 74.080 1.220 20 | |
131 | # group.delta[1] 6.017 11.391 -16.095 0.448 5.316 10.059 34.792 1.152 21 | |
132 | # group.delta[2] 5.662 11.318 -15.836 0.054 5.009 10.107 33.548 1.139 21 | |
133 | # group.mean[1] 58.494 3.765 50.973 56.188 58.459 60.838 66.072 1.001 3000 | |
134 | # group.mean[2] 58.139 2.857 52.687 56.366 58.098 59.851 63.999 1.003 920 | |
135 | # group.within.sigma[1] 12.801 2.766 8.241 10.700 12.446 14.641 18.707 1.002 1100 | |
136 | # group.within.sigma[2] 9.274 2.500 5.688 7.475 8.834 10.539 15.700 1.002 1600 | |
137 | # sigma.between.group 18.031 21.159 0.553 3.793 9.359 23.972 82.604 1.006 1700 | |
138 | # deviance 149.684 2.877 145.953 147.527 149.081 151.213 156.933 1.001 3000 | |
139 | ||
140 | ## VOI | |
141 | ||
142 | posteriorXs <- k1$BUGSoutput$sims.list[["grand.mean"]] | |
143 | MSE <- function(x1, x2) { (x2 - x1)^2 } | |
144 | lossFunction <- function(x, predictions) { mean(sapply(predictions, function(x2) { MSE(x, x2)}))} | |
145 | ## our hypothetical mean-squared loss if we predicted, say, X=60: | |
146 | lossFunction(60, posteriorXs) | |
147 | # [1] 184.7087612 | |
148 | ## of the possible values for X, 1-100, what value of X minimizes our squared error loss? | |
149 | losses <- sapply(c(1:100), function (n) { lossFunction(n, posteriorXs);}) | |
150 | which.min(losses) | |
151 | - | ## 52 also equals the mean estimate of X, which is good since it's well known that the mean is what minimizes the loss when the loss is squared-error |
151 | + | |
152 | - | ## so it suggests that I have not screwed up the definitions |
152 | + | ## 52 also equals the mean estimate of X, which is good since it's well known that the mean is what minimizes |
153 | ## the loss when the loss is squared-error so it suggests that I have not screwed up the definitions | |
154 | losses[52] | |
155 | [1] 128.3478462 | |
156 | ||
157 | ## to calculate EVSI, we repeatedly simulate a few hundred times the existence of a hypothetical 'C' measurement | |
158 | ## and draw n samples from it; | |
159 | ## then we add the C data to our existing A & B data; run our Bayesian multilevel model again on the bigger dataset;, | |
160 | ## calculate what the new loss is, and compare it to the old loss to see how much the new data | |
161 | ## reduced the loss/mean-squared-error. | |
162 | ## Done for each possible n (here, 1-30) and averaged out, this tells us how much 1 additional datapoint is worth, | |
163 | ## 2 additional datapoints, 3 additional datapoints, etc. | |
164 | sampleValues <- NULL | |
165 | for (i in seq(from=1, to=30)) { | |
166 | ||
167 | evsis <- replicate(500, { | |
168 | n <- i | |
169 | ||
170 | C <- toplevel + runif(1, min=-10, max=10) | |
171 | C_1 <- rnorm(n=n, mean=C, sd=10) | |
172 | ## all as before, more or less: | |
173 | newData <- rbind(data, data.frame(Measurement="C", Y=C_1)) | |
174 | ||
175 | jagsData <- list(N=nrow(newData), Y=newData$Y, K=length(levels(newData$Measurement)), | |
176 | Group=newData$Measurement) | |
177 | params <- c("grand.mean","delta.between.group", "sigma.between.group", "group.delta", "group.mean", | |
178 | "group.within.sigma") | |
179 | jEVSI <- jags(data=jagsData, parameters.to.save=params, inits=NULL, model.file=model) | |
180 | ||
181 | posteriorTimesEVSI <- jEVSI$BUGSoutput$sims.list[["grand.mean"]] | |
182 | lossesEVSI <- sapply(c(1:100), function (n) { lossFunction(n, posteriorTimesEVSI);}) | |
183 | ||
184 | oldOptimum <- 128.3478462 # losses[52] | |
185 | newOptimum <- losses[which.min(lossesEVSI)] | |
186 | EVSI <- newOptimum - oldOptimum | |
187 | return(EVSI) | |
188 | } | |
189 | ) | |
190 | ||
191 | print(i) | |
192 | ||
193 | print(mean(evsis)) | |
194 | sampleValues[i] <- mean(evsis) | |
195 | } | |
196 | sampleValues | |
197 | # [1] 13.86568780 11.07101087 14.15645538 13.05296681 11.98902668 13.86866619 13.65059093 14.05991443 | |
198 | # [9] 14.80018511 16.36944874 15.47624541 15.64710237 15.74060632 14.79901214 13.36776390 15.35179426 | |
199 | # [17] 14.31603459 13.70914727 17.20433606 15.89925289 16.35350861 15.09886204 16.30680175 16.27032067 | |
200 | # [25] 16.30418553 18.84776433 17.86881713 16.65973397 17.04451609 19.17173439 | |
201 | ||
202 | ## As expected, the gain in reducing MSE continues increasing as data comes in but with diminishing returns; | |
203 | ## this is probably because in a multilevel model like this, you aren't using the _n_ datapoints to estimate X | |
204 | ## directly so much as you are using them to estimate a much smaller number of latent variables, which are then | |
205 | ## the _n_ used to estimate X. So instead of getting hyperprecise estimates of A/B/C, you need to sample from additional | |
206 | ## groups D/E/F/... Trying to improve your estimate of X by measuring A/B/C many times is like trying to estimate | |
207 | ## IQ precisely by administering a WM test a hundred times. | |
208 | ||
209 | - | ## requires much more data than usual. |
209 | + | ## If we wanted to compare with alternatives like instead sampling n data points from C and a D, it's easy to modify |
210 | ## the EVSI loop to do so: generate `D <- toplevel + runif(1, min=-10, max=10); D_1 <- rnorm(n=n, mean=D, sd=10)` | |
211 | ## and now `rbind` D_1 in as well. At a guess, after 5-10 samples from the current group, estimates of X will be improved more | |
212 | ## by then sampling from a new group. | |
213 | ||
214 | ## Or the loss function could be made more realistic. It's unlikely one is paid by MSE, and if one adds in how much | |
215 | ## money each sample costs, with a realistic loss function, one could decide exactly how much data is optimal to collect. | |
216 | ||
217 | ## To very precisely estimate X, when our measurements are needed to measure at least 3 latent variables, | |
218 | ## requires much more data than usual. | |
219 | ||
220 | ## In general, we can see the drawbacks and benefits of each approach. A canned MLM | |
221 | ## is very fast to write but doesn't let us include prior information or easily run | |
222 | ## additional analyses like how much additional samples are worth. SEM works poorly | |
223 | ## on small samples but is still easy to write in if we have more complicated | |
224 | ## models of measurement error. A full-blown modeling language like JAGS is quite | |
225 | ## difficult to write in and MCMC is slower than other approaches but handles small | |
226 | ## samples without any errors or problems and offers maximal flexibility in using | |
227 | ## the known prior information and then doing decision-theoretic stuff. Overall for | |
228 | ## this problem, I think JAGS worked out best, but possibly I wasn't using LAVAAN | |
229 | ## right and that's why SEM didn't seem to work well. |