Online Appendix # R code to run Monte Carlo simulations for "Two Multilevel Modeling Techniques for Analyzing Comparative Longitudinal Survey Datasets" # 30 March 2013 library(lme4.0) library(multicore) # for mclapply library(sn) # for rmsn N <- 25 # set number of countries n <- 500 # set number of people observed per country T <- 30 # set number of years in study period cwaves <- expand.grid(year=1:T, country=1:N) # create year and country indices cwaves$cwave <- apply(cwaves, 1, function(x) 100*x[2]+x[1]) # create cwave index c.mer <- function(mod) { c(fixef(mod), vcov(mod)@factors$correlation@sd, c(unlist(lapply(VarCorr(mod), diag)), attr(VarCorr(mod), "sc")^2)) } # function to extract FEs, SEs, REs # main model: yitj = B0 - B1*Xitj + B2*Xj - B3*timetj + B4*timetj*Xj + B5*(Xtj-Xj) + Uj + Utj + eitj # alternative model for comparing de facto pooling with centering: logit(yitj) = B0 - B1*Xitj + B2*Xj - B3*timetj + B4*(Xtj-Xj) + Uj + Utj dgp <- function(cwaves, N, T, n, s, ac, a, la, sam, waves, cof1) { # function to generate the data if(all(c(a!=0, sum(s-diag(diag(s)))!=0))) stop("Having both skew and non-zero covariance changes variance.") s <- s[rowSums(s)!=0,colSums(s)!=0] # to avoid an error when calling rmsn sh <- c(a, rep(0, ncol(s)-1)) # set shape parameters w <- s/(1-((2*(sh^2)/(1 + sh^2))/pi)) # determine scale parameter to get desired variances xi <- c(-sqrt(w)%*%sh/sqrt(1 + sh^2)*sqrt(2/pi)) # determine location parameter to get mean of 0 dat <- rmsn(n=N, xi=xi, Omega=w, alpha=sh) # depending on s and a, generates correlated data, or Skew Normal country intercepts if (ncol(dat)<3) dat <- cbind(dat, 0) dat <- data.frame(cwaves, dat[cwaves$country,]) # expands to N*T rows names(dat)[4:6] <- c("Uj", "Xj", "Xitj") # country-level random intercept, country mean X, and probability of Xitj being equal to one dat$Utj <- suppressWarnings(unlist(by(dat, dat$country, function(x) as.numeric(arima.sim(list(ar=ac), n=T, n.start=T))))) # set autocorrelation (if any) dat$Xtj <- do.call("c", by(dat, dat$country, function(x) (0:(T-1))*rnorm(1, 0.05, 0.05))) # time-varying covariate with a random linear trend dat$Xtj <- dat$Xj + dat$Xtj - as.numeric(by(dat, dat$country, function(x) mean(x$Xtj)))[dat$country] # country-wave-level covariate dat$XtjM <- dat$Xtj-dat$Xj # mean-centred version of Xtj dat$XtjMlag <- unlist(by(dat, dat$country, function(x) c(rep(NA,la),x$XtjM[1:(T-la)]))) # creates a version of XtjM with lag la if (sam) dat <- do.call("rbind", by(dat, dat$country, function(x) x[rep(sample(x$year[(la+1):T], waves, prob=plogis(dat$Xtj)[(la+1):T]),n/waves),])) else dat <- do.call("rbind", by(dat, dat$country, function(x) x[rep(sample(x$year[(la+1):T], waves),n/waves),])) # waves*N*n rows dat$Xitj <- rbinom(nrow(dat), 1, plogis(dat$Xitj)) # individual-level (binary) covariate dat$year <- dat$year/20 # simply to keep the variation proportional dat$y <- 1 - dat$Xitj + dat$Xj - dat$year + dat$year*dat$Xj + dat$XtjMlag + dat$Uj + dat$Utj + rnorm(nrow(dat), sd=3) # calculate Normally distributed outcome dat$ybi <- rbinom(nrow(dat), 1, plogis(1 - dat$Xitj + dat$Xj - dat$year + dat$year*dat$Xj + dat$XtjMlag + dat$Uj + dat$Utj)) # calculate (binary) outcome if (cof1!=-1) dat$ybi <- rbinom(nrow(dat), 1, plogis(1 - dat$Xitj + dat$Xj - dat$year + cof1*dat$XtjMlag + dat$Uj + dat$Utj)) # alternative model dat[order(dat$cwave, dat$Xitj),] } bin <- function(dat) { # function to aggregate binary responses by combination of fixed and random effects dat2 <- by(dat, list(dat$Xitj,dat$cwave), function(x) c(sum(x$ybi==1),sum(x$ybi==0))) dat2 <- data.frame(do.call("rbind", dat2)) names(dat2) <- c("successes", "failures") data.frame(dat2, dat[!duplicated(dat$cwave*100+dat$Xitj),]) } sims <- 1000 # set number of simulations per combination of conditions, for lme4 sim <- function(cwaves, N, T, n, s, ac, a, la, sam, logit, waves, cof1=-1) { # function to generate data, estimate a model with (g)lmer, and return the results dat <- dgp(cwaves=cwaves, N=N, T=T, n=n, s=s, ac=ac, a=a, la=la, sam=sam, waves=waves, cof1=cof1) # simulate data using dgp if (logit) { # fit logit model mod1 <- glmer(cbind(successes, failures) ~ Xitj + year*Xj + XtjM + (1 | cwave) + (1 | country), bin(dat), family=binomial) # with centering mod2 <- glmer(cbind(successes, failures) ~ Xitj + year + Xtj + (1 | cwave) + (1 | country), bin(dat), family=binomial) # no centering } if (!logit) { # fit linear model mod1 <- lmer(y ~ Xitj + year*Xj + XtjM + (1 | cwave) + (1 | country), dat) # with centering mod2 <- lmer(y ~ Xitj + year + Xtj + (1 | cwave) + (1 | country), dat) # no centering } c(c.mer(mod1), c.mer(mod2)) # return 6 FEs, 6 SEs, and 3 REs, for each model } # DGP conditions: s <- list(diag(c(2,1,0)), matrix(c(2,0.7,0,0.7,1,0,0,0,0), ncol=3), matrix(c(2,0,0.7,0,1,0,0.7,0,0.5), ncol=3)) # correlation ac <- c(0, 0.25) # autocorrelation in the country-wave random intercepts a <- c(0, 10) # skewness of the random intercepts la <- c(0, 5) # lag sam <- c(FALSE, TRUE) # weighted selection logit <- c(FALSE, TRUE) # binary outcome waves <- c(2, 5, 20) com <- expand.grid(s=s, ac=ac, a=a, la=la, sam=sam, logit=logit, waves=waves) com <- com[com$a==0 | sapply(com$s, function(ss) sum(ss[upper.tri(ss)]))==0,] # to avoid having both skew and non-zero covariance rownames(com) <- 1:nrow(com) res <- apply(com, 1, function(x) mclapply(1:sims, function(y) sim(cwaves=cwaves, N=N, T=T, n=n, s=x$s, ac=x$ac, a=x$a, la=x$la, sam=x$sam, logit=x$logit, waves=x$waves))) res <- lapply(res, function(x) do.call(rbind, x)) save(res, file="res.RData") ######## MCMC library(MCMCglmm) c.MC <- function(mod) { c(matrix(summary(mod)$solutions[,1:3]), apply(mod$Sol, 2, sd), summary(mod)$Gcovariances[,1], summary(mod)$Rcovariances[,1]) } # function to extract FEs (post.mean, l-95% CI, u-95% CI), SEs, REs priors1 <- list(R = list(V = 1, nu = 0.002), G = list(G1 = list(V =1, nu = 0.002), G2 = list(V =1, nu = 0.002))) simMC <- function(cwaves, N, T, n, s, ac, a, la, sam, waves, logit, cof1=-1) { # function to generate data, estimate a model with MCMCglmm, and return the results dat <- dgp(cwaves=cwaves, N=N, T=T, n=n, s=s, ac=ac, a=a, la=la, sam=sam, waves=waves, cof1=cof1) # simulate data using dgp if (logit) { # fit logit model mod1 <- MCMCglmm(cbind(successes,failures) ~ Xitj + year*Xj + XtjM, random = ~ cwave + country, data = bin(dat), family="multinomial2", verbose=F, prior=priors1) mod2 <- MCMCglmm(cbind(successes,failures) ~ Xitj + year + Xtj, random = ~ cwave + country, data = bin(dat), family="multinomial2", verbose=F, prior=priors1) } if (!logit) { # fit linear model mod1 <- MCMCglmm(y ~ Xitj + year*Xj + XtjM, random = ~ cwave + country, data = dat, verbose=F, prior=priors1) mod2 <- MCMCglmm(y ~ Xitj + year + Xtj, random = ~ cwave + country, data = dat, verbose=F, prior=priors1) } c(c.MC(mod1), c.MC(mod2)) # return 6 FEs, 6 SEs, and 3 REs, for each model } simsMC <- 300 # set number of simulations per combination of conditions, for MCMCglmm # DGP conditions for MCMC # replicating eight main DGPs above, plus 5 additional (each for 2, 5, and 20 country-waves per country) com <- com[c(c(1:4, 7, 9, 17, 33), c(1:4, 7, 9, 17, 33)+64, c(1:4, 7, 9, 17, 33)+128),] com$cof1 <- -1 cof1 <- seq(0, 1, by=0.25) comMC <- expand.grid(s=s, ac=ac[1], a=a[1], la=la[1], sam=sam[1], logit=logit[2], waves=waves, cof1=cof1)[c(3*1:15-2),] comMC <- rbind(com, comMC) rownames(comMC) <- 1:nrow(comMC) # run the simulations (parallelizing across as many cores as are available) resMC <- apply(comMC, 1, function(x) mclapply(1:simsMC, function(y) simMC(cwaves=cwaves, N=N, T=T, n=n, s=x$s, ac=x$ac, a=x$a, la=x$la, sam=x$sam, waves=x$waves, logit=x$logit, cof1=x$cof1))) resMC <- lapply(resMC, function(x) do.call(rbind, x)) # convert from a list of lists, to a list of matrices save(resMC, file="resMC.RData") q("no")