## code to reproduce Table 3 library(splines) set.seed(3001) options(width=150) sessionInfo() ################################################ ## initial work on CSEM function from real data ################################################ ## define CSEM function corresponding to Figure 1: case <- data.frame(score = c(232,236,239,242,245,248,251,253,256,258,260,263,265,267,269,271,273,275,277,279,281,283,285,286,288,290,292,294,296,298,300,302,304,306,309,311,313,316,318,321,324,327,330,333,337,341,350,351,357,365,376,393,423), csem = c(10,9,9,9,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,9,9,9,10,10,12,12,13,15,18,25,45)) case$cvar <- case$csem^2 ## interpolate variance function and go flat outside range of data varfunc <- approxfun(x = case$score, y = case$cvar, rule=2) ## check: ## x <- seq(from=0.8*min(case$score), to = 1.2*max(case$score), length=1000) ## plot(x, varfunc(x), type="l") ## points(case$score, case$cvar, col="blue") ## using raw achievement data (which cannot be posted) and deconvolution methods ## we estimated X in the population had mean 334.3 and standard deviation 32.8: mux <- 334.3 sdx <- 32.8 ################################################ ## use Monte Carlo methods to approximate A(W) ################################################ xgrid <- seq(from=min(case$score), to=max(case$score), length=200) nreps <- 30000 d <- data.frame(x=rep(xgrid, each=nreps)) d$target <- varfunc(d$x) d$w <- d$x + rnorm(nrow(d), mean=0, sd = sqrt(d$target)) s <- bs(d$w, df=10, intercept=TRUE) colnames(s) <- paste("phi",colnames(s),sep="") b <- aggregate(s, by=list(x=d$x), FUN=mean) b <- merge(b, unique(d[,c("x","target")]), all.x=TRUE) mod <- lm(as.formula(paste("target ~", paste(colnames(s), collapse=" + "), "-1")), data=b) d$fit <- s %*% coef(mod) A1 <- approxfun(x = d$w, y = d$fit, rule=2) ################################################ ## Loop over lambda to create raw data ## break simulation into 5 chunks of 50,000,000 each to keep RAM manageable ################################################ nchunk <- 5 n <- 50000000 bigres <- vector(nchunk, mode="list") for(ch in 1:nchunk){ lambdaseq <- c(1,2) nlam <- length(lambdaseq) res <- vector(nlam, mode="list") ## loop over lambda for(wh in 1:nlam){ lambda <- lambdaseq[wh] cat(paste("\n\n#########################",ch,lambda,"#########################\n")) ## generate X d <- data.frame(x1 = rnorm(n, mean = mux, sd = sdx)) ## generate Ws as conditionally normal given X g1 <- varfunc d$w1 <- d$x1 + rnorm(n, sd = sqrt(g1(d$x1))) ## get other plug-ins d$egx <- mean(g1(d$x1)) ## brute force condition on w by rounding it to nearest integer d$wcat <- as.factor(round(d$w1, digits=0)) d$exgivenw <- ave(d$x1, d$wcat) d$egxgivenw <- ave(g1(d$x1), d$wcat) ####################################################### ## generate different values of W* ## mean center to get central moments later ####################################################### ## 1) using true variance d$w1star.gx <- d$w1 + rnorm(n, sd = sqrt(lambda * g1(d$x1))) - mux ## 2) fresh sample using right variance function d$w1star.fresh <- d$x1 + rnorm(n, sd = sqrt((1 + lambda) * g1(d$x1))) - mux ## 3) average g(X) d$w1star.avggx <- d$w1 + rnorm(n, sd = sqrt(lambda * d$egx)) - mux ## 4) average g(W) d$w1star.avggw <- d$w1 + rnorm(n, sd = sqrt(lambda * mean(g1(d$w1)))) - mux ## 5) g(W) plugin d$w1star.gw <- d$w1 + rnorm(n, sd = sqrt(lambda * g1(d$w1))) - mux ## 6) g(E[X|W]) d$w1star.gexw <- d$w1 + rnorm(n, sd = sqrt(lambda * g1(d$exgivenw))) - mux ## 7) A(W) d$w1star.aw <- d$w1 + rnorm(n, sd = sqrt(lambda * A1(d$w1))) - mux ## 8) E[g(X)|W] d$w1star.rcal <- d$w1 + rnorm(n, sd = sqrt(lambda * d$egxgivenw)) - mux m1 <- c(mean(d$w1star.gx), mean(d$w1star.gx^2), mean(d$w1star.gx^3), mean(d$w1star.gx^4), mean(d$w1star.gx^5), mean(d$w1star.gx^6), mean(d$w1star.gx^7),mean(d$w1star.gx^8)) m2 <- c(mean(d$w1star.fresh), mean(d$w1star.fresh^2), mean(d$w1star.fresh^3), mean(d$w1star.fresh^4), mean(d$w1star.fresh^5), mean(d$w1star.fresh^6), mean(d$w1star.fresh^7),mean(d$w1star.fresh^8)) m3 <- c(mean(d$w1star.avggx), mean(d$w1star.avggx^2), mean(d$w1star.avggx^3), mean(d$w1star.avggx^4), mean(d$w1star.avggx^5), mean(d$w1star.avggx^6), mean(d$w1star.avggx^7), mean(d$w1star.avggx^8)) m4 <- c(mean(d$w1star.avggw), mean(d$w1star.avggw^2), mean(d$w1star.avggw^3), mean(d$w1star.avggw^4), mean(d$w1star.avggw^5), mean(d$w1star.avggw^6), mean(d$w1star.avggw^7),mean(d$w1star.avggw^8)) m5 <- c(mean(d$w1star.gw), mean(d$w1star.gw^2), mean(d$w1star.gw^3), mean(d$w1star.gw^4), mean(d$w1star.gw^5), mean(d$w1star.gw^6), mean(d$w1star.gw^7),mean(d$w1star.gw^8)) m6 <- c(mean(d$w1star.gexw), mean(d$w1star.gexw^2), mean(d$w1star.gexw^3), mean(d$w1star.gexw^4), mean(d$w1star.gexw^5), mean(d$w1star.gexw^6), mean(d$w1star.gexw^7),mean(d$w1star.gexw^8)) m7 <- c(mean(d$w1star.aw), mean(d$w1star.aw^2), mean(d$w1star.aw^3), mean(d$w1star.aw^4), mean(d$w1star.aw^5), mean(d$w1star.aw^6), mean(d$w1star.aw^7), mean(d$w1star.aw^8)) m8 <- c(mean(d$w1star.rcal), mean(d$w1star.rcal^2), mean(d$w1star.rcal^3), mean(d$w1star.rcal^4), mean(d$w1star.rcal^5), mean(d$w1star.rcal^6), mean(d$w1star.rcal^7), mean(d$w1star.rcal^8)) m <- round(rbind(m1, m2, m3, m4, m5, m6, m7, m8), digits=3) colnames(m) <- paste("mom",1:8,sep="") print(tmp <- data.frame(lambda = lambda, method=c("gx","fresh","avggx","avggw","gw","gexw","aw","rcal"), m)) ## rearrange tmp to align with table order in paper and save res[[wh]] <- tmp[c(1,2,7,3,8,5,4,6),] rm(d) gc() } ## store results for this chunk bigres[[ch]] <- res rm(res) gc() } ################################################ ## Create table ################################################ res <- vector(nchunk, mode="list") for(ch in 1:nchunk){ res[[ch]] <- rbind(bigres[[ch]][[1]], bigres[[ch]][[2]]) res[[ch]]$sim <- ch } res <- do.call("rbind", res) res$mom1 <- NULL moms <- names(res)[grep("mom",names(res))] res <- subset(res, method != "fresh") ## average moments across simulation replications for(v in moms){ res[,v] <- ave(res[,v], list(res$lambda, res$method)) } res <- unique(res[,c("lambda","method",moms)]) res$index <- 1:nrow(res) ## get benchmark moments from gx cases and merge them onto res b <- subset(res, method=="gx") names(b)[which(names(b) %in% moms)] <- paste(names(b)[which(names(b) %in% moms)],"bench",sep="") b$index <- NULL b$method <- NULL res <- merge(res, b, all.x=TRUE) res <- res[order(res$index),] ## calculate percent bias for(v in moms){ res[,v] <- round(100 * (res[,v] - res[,paste(v,"bench",sep="")]) / res[,paste(v,"bench",sep="")], digits=0) } ## reorganize table, get rid of irrelevant columns res <- subset(res, method != "gx", select=c("method","lambda",moms)) ## resort to put the different lambdas adjacent for the same method nest <- nrow(res)/2 print(res[c(t(cbind(1:nest, (nest+1):nrow(res)))),])