## code to reproduce Table 3
library(splines)
set.seed(3001)
options(width=150)
sessionInfo()

################################################
## initial work on CSEM function from real data
################################################

## define CSEM function corresponding to Figure 1:
case <- data.frame(score = c(232,236,239,242,245,248,251,253,256,258,260,263,265,267,269,271,273,275,277,279,281,283,285,286,288,290,292,294,296,298,300,302,304,306,309,311,313,316,318,321,324,327,330,333,337,341,350,351,357,365,376,393,423), csem  = c(10,9,9,9,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,9,9,9,10,10,12,12,13,15,18,25,45))
case$cvar <- case$csem^2

## interpolate variance function and go flat outside range of data
varfunc <- approxfun(x = case$score, y = case$cvar, rule=2)
## check:
## x <- seq(from=0.8*min(case$score), to = 1.2*max(case$score), length=1000)
## plot(x, varfunc(x), type="l")
## points(case$score, case$cvar, col="blue")

## using raw achievement data (which cannot be posted) and deconvolution methods
## we estimated X in the population had mean 334.3 and standard deviation 32.8:
mux <- 334.3
sdx <- 32.8

################################################
## use Monte Carlo methods to approximate A(W)
################################################
xgrid <- seq(from=min(case$score), to=max(case$score), length=200)
nreps <- 30000
d     <- data.frame(x=rep(xgrid, each=nreps))
d$target <- varfunc(d$x)
  
d$w <- d$x + rnorm(nrow(d), mean=0, sd = sqrt(d$target))
s   <- bs(d$w, df=10, intercept=TRUE)
colnames(s) <- paste("phi",colnames(s),sep="")
  
b <- aggregate(s, by=list(x=d$x), FUN=mean)
b <- merge(b, unique(d[,c("x","target")]), all.x=TRUE)
  
mod <- lm(as.formula(paste("target ~", paste(colnames(s), collapse=" + "), "-1")), data=b)
d$fit <- s %*% coef(mod)

A1 <- approxfun(x = d$w, y = d$fit, rule=2)

################################################
## Loop over lambda to create raw data
## break simulation into 5 chunks of 50,000,000 each to keep RAM manageable
################################################
nchunk    <- 5
n         <- 50000000
bigres    <- vector(nchunk, mode="list")

for(ch in 1:nchunk){
    lambdaseq <- c(1,2)
    nlam <- length(lambdaseq)
    res <- vector(nlam, mode="list")

    ## loop over lambda
    for(wh in 1:nlam){
        lambda <- lambdaseq[wh]
        
        cat(paste("\n\n#########################",ch,lambda,"#########################\n"))
        
        ## generate X
        d <- data.frame(x1 = rnorm(n, mean = mux, sd = sdx))
  
        ## generate Ws as conditionally normal given X
        g1   <- varfunc
        d$w1 <- d$x1 + rnorm(n, sd = sqrt(g1(d$x1)))
        
        ## get other plug-ins
        d$egx <- mean(g1(d$x1))
       
        ## brute force condition on w by rounding it to nearest integer
        d$wcat <- as.factor(round(d$w1, digits=0))
        d$exgivenw  <- ave(d$x1,     d$wcat)
        d$egxgivenw <- ave(g1(d$x1), d$wcat)
        
        #######################################################
        ## generate different values of W*
        ## mean center to get central moments later
        #######################################################
        ## 1) using true variance
        d$w1star.gx    <- d$w1 + rnorm(n, sd = sqrt(lambda * g1(d$x1))) - mux
        ## 2) fresh sample using right variance function
        d$w1star.fresh <- d$x1 + rnorm(n, sd = sqrt((1 + lambda) * g1(d$x1))) - mux
        ## 3) average g(X)
        d$w1star.avggx <- d$w1 + rnorm(n, sd = sqrt(lambda * d$egx)) - mux
        ## 4) average g(W)
        d$w1star.avggw <- d$w1 + rnorm(n, sd = sqrt(lambda * mean(g1(d$w1)))) - mux
        ## 5) g(W) plugin
        d$w1star.gw    <- d$w1 + rnorm(n, sd = sqrt(lambda * g1(d$w1))) - mux
        ## 6) g(E[X|W])
        d$w1star.gexw  <- d$w1 + rnorm(n, sd = sqrt(lambda * g1(d$exgivenw))) - mux
        ## 7) A(W)
        d$w1star.aw    <- d$w1 + rnorm(n, sd = sqrt(lambda * A1(d$w1)))  - mux
        ## 8) E[g(X)|W]
        d$w1star.rcal  <- d$w1 + rnorm(n, sd = sqrt(lambda * d$egxgivenw)) - mux
        
        
        m1 <- c(mean(d$w1star.gx), mean(d$w1star.gx^2), mean(d$w1star.gx^3), mean(d$w1star.gx^4), mean(d$w1star.gx^5), mean(d$w1star.gx^6), mean(d$w1star.gx^7),mean(d$w1star.gx^8))
        m2 <- c(mean(d$w1star.fresh), mean(d$w1star.fresh^2), mean(d$w1star.fresh^3), mean(d$w1star.fresh^4), mean(d$w1star.fresh^5), mean(d$w1star.fresh^6), mean(d$w1star.fresh^7),mean(d$w1star.fresh^8))
        m3 <- c(mean(d$w1star.avggx), mean(d$w1star.avggx^2), mean(d$w1star.avggx^3), mean(d$w1star.avggx^4), mean(d$w1star.avggx^5), mean(d$w1star.avggx^6), mean(d$w1star.avggx^7), mean(d$w1star.avggx^8))
        m4 <- c(mean(d$w1star.avggw), mean(d$w1star.avggw^2), mean(d$w1star.avggw^3), mean(d$w1star.avggw^4), mean(d$w1star.avggw^5), mean(d$w1star.avggw^6), mean(d$w1star.avggw^7),mean(d$w1star.avggw^8))
        m5 <- c(mean(d$w1star.gw), mean(d$w1star.gw^2), mean(d$w1star.gw^3), mean(d$w1star.gw^4), mean(d$w1star.gw^5), mean(d$w1star.gw^6), mean(d$w1star.gw^7),mean(d$w1star.gw^8))
        m6 <- c(mean(d$w1star.gexw), mean(d$w1star.gexw^2), mean(d$w1star.gexw^3), mean(d$w1star.gexw^4), mean(d$w1star.gexw^5), mean(d$w1star.gexw^6), mean(d$w1star.gexw^7),mean(d$w1star.gexw^8))
        m7 <- c(mean(d$w1star.aw), mean(d$w1star.aw^2), mean(d$w1star.aw^3), mean(d$w1star.aw^4), mean(d$w1star.aw^5), mean(d$w1star.aw^6), mean(d$w1star.aw^7), mean(d$w1star.aw^8))
        m8 <- c(mean(d$w1star.rcal), mean(d$w1star.rcal^2), mean(d$w1star.rcal^3), mean(d$w1star.rcal^4), mean(d$w1star.rcal^5), mean(d$w1star.rcal^6), mean(d$w1star.rcal^7), mean(d$w1star.rcal^8))
        
        m <- round(rbind(m1, m2, m3, m4, m5, m6, m7, m8), digits=3)
        colnames(m) <- paste("mom",1:8,sep="")
        print(tmp <- data.frame(lambda = lambda, method=c("gx","fresh","avggx","avggw","gw","gexw","aw","rcal"), m))
        
        ## rearrange tmp to align with table order in paper and save
        res[[wh]] <- tmp[c(1,2,7,3,8,5,4,6),]
        rm(d)
        gc()
    }

    ## store results for this chunk
    bigres[[ch]] <- res
    rm(res)
    gc()
}

################################################
## Create table
################################################
res <- vector(nchunk, mode="list")
for(ch in 1:nchunk){
    res[[ch]]     <- rbind(bigres[[ch]][[1]], bigres[[ch]][[2]])
    res[[ch]]$sim <- ch
}

res <- do.call("rbind", res)
res$mom1 <- NULL
moms <- names(res)[grep("mom",names(res))]
res <- subset(res, method != "fresh")

## average moments across simulation replications
for(v in moms){
    res[,v] <- ave(res[,v], list(res$lambda, res$method))
}
res <- unique(res[,c("lambda","method",moms)])
res$index <- 1:nrow(res)
  
## get benchmark moments from gx cases and merge them onto res
b <- subset(res, method=="gx")
names(b)[which(names(b) %in% moms)] <- paste(names(b)[which(names(b) %in% moms)],"bench",sep="")
b$index <- NULL
b$method <- NULL
res <- merge(res, b, all.x=TRUE)
res <- res[order(res$index),]
  
## calculate percent bias
for(v in moms){
    res[,v] <- round(100 * (res[,v] - res[,paste(v,"bench",sep="")]) / res[,paste(v,"bench",sep="")], digits=0)
}
  
## reorganize table, get rid of irrelevant columns
res <- subset(res, method != "gx", select=c("method","lambda",moms))
  
## resort to put the different lambdas adjacent for the same method
nest <- nrow(res)/2
print(res[c(t(cbind(1:nest, (nest+1):nrow(res)))),])