################################################################################
## code to reproduce Table 4
## THIS FILE: SIMEX simulations for n=5000
################################################################################

library(splines)

set.seed(4040)
options(width=150)
sessionInfo()

objfile <- "~/simex-vfunc/code-files-for-psychometrika-paper-aug2016/code-for-table4-n5000.Robj"

################################################
## function for deconvolution via NPMLE
################################################
npmle_deconv <- function(W, csem, xmin = -5, xmax = +5, ngrid = 2000, lambda = 0.0005,
                         lltol = 1e-7, psmall = 0.00005, discrete = FALSE, quietly = FALSE){
    ## nonparametric MLE of the distribution of the error-free variable "X" given the
    ## error-prone variable "W", where W = X + U and U|X ~ N(0, csem^2(X))
    ##
    ## uses the algorithm described in Rabe-Hesketh et al (2003). "Correcting for
    ## covariate measurement error in logistic regression using nonparametric
    ## maximum likelihood estimation." Statistical Modelling 3: 215-232.
    ##
    ## INPUTS:
    ## W:      vector of observed scores
    ## csem:   function of a single variable x that returns the conditional standard
    ##         deviation of W|X=x. needs to accept vector input and return vector output
    ## xmin, xmax, ngrid: define the grid over which the X distribution is calculated.
    ##         NOTE: the default (xmin,xmax) is appropriate only for a standardized score;
    ##         it will need to be changed for typical test-score reporting scales
    ## lambda: step in the RH algorithm, on the probability scale - this seemed like a
    ##         good value based on simulations
    ## lltol:  when we cannot find a step that improves the likelihood by more than lltol, we stop
    ## psmall: if a point evolves to have mass less than psmall, we drop it
    ## discrete: TODO for a future version
    ## quietly:  if FALSE, prints iteration-by-iteration output
    ##
    ## VALUE:
    ## list(.history = history of probability distributions,
    ##      px       = data frame containing summary info on the final distribution)
    stopifnot( all(!is.na(W)) && is.function(csem) && (xmin < xmax) && (lambda > 0) && (lltol > 0) )

    if(discrete){
        stop("discrete option to be implemented in future version")
    }

    if(min(W) < xmin){
        stop("xmin exceeds smallest data value; consider decreasing xmin")
    }

    if(max(W) > xmax){
        stop("largest data value exceeds xmax; consider increasing xmax")
    }

    ## NOTE: we collapse over repeated W values when possible, using "counts" in the likelihood
    .W      <- sort(unique(W))
    nW      <- length(.W)
    .counts <- sapply(.W, function(x){ sum(W == x) })
    stopifnot(sum(.counts) == length(W))
    W <- .W

    ## functions to map probabilities to a reduced-dimension unconstrained scale, and back
    theta_to_p <- function(theta){
        if(length(theta) == 0){
            return(1)
        } else {
            p <- exp(theta) / (1 + sum(exp(theta)))
            return(c(p, 1-sum(p)))
        }
    }

    p_to_theta <- function(p){
        stopifnot( all(p > 0) && (abs((1 - sum(p))) < 1e-9) )
        if(length(p) == 1){
            return(numeric(0))
        } else {
            last <- length(p)
            return(log(p[1:(last-1)] * (1 + ((1-p[last])/p[last])) ))
        }
    }

    ## CHECKS (with ma <- function(x){ max(abs(x)) }):
    ## p <- c(0.1, 0.2, 0.4, 0.3); print(ma(p - theta_to_p(p_to_theta(p))))
    ## theta <- c(-3.2, 1.0, -1.0); print(ma(theta - p_to_theta(theta_to_p(theta))))
    ## print(ma(1 - theta_to_p(p_to_theta(1))))
    ## p_to_theta(theta_to_p(numeric(0)))

    ## build (nW x ngrid) matrix of conditional densities if discrete == FALSE,
    ## or conditional probabilities if discrete == TRUE
    grid      <- seq(from=xmin, to=xmax, length=ngrid)
    grid.csem <- csem(grid)

    if(!discrete){
        fwx <- dnorm( matrix(W, ncol=ngrid, nrow=nW),
                     mean=matrix(grid,      ncol=ngrid, nrow=nW, byrow=T),
                     sd  =matrix(grid.csem, ncol=ngrid, nrow=nW, byrow=T))
    } else {
        ## uses pxgu (not defined in this file; this branch is unreachable given the
        ## stop() above) - fwx[i,j] = p(W=W[i] | X = grid[j])
        tmp <- lapply(grid, function(u){ pxgu(u, csem(u), .W) })
        stopifnot(all(sapply(tmp, function(x){ all(x[,1] == .W) })))
        fwx <- do.call("cbind", lapply(tmp, function(x){ x[,2] }))
    }

    ## negative log likelihood for a set of probabilities given ".inds", which are
    ## indices of "grid" and are continually updated.
    ## NOTE: updated to use counts
    negll <- function(theta){
        -sum(.counts * log(as.vector(fwx[,.inds] %*% theta_to_p(theta))))
    }

    ## find the initial best grid point. NOTE this will often pick the X with the
    ## biggest CSEM, because a single point is inconsistent with most W unless the
    ## CSEM is large. the exception is if we pick ngrid to be very large; then it
    ## will find an interior point. however, even if it picks an extreme starting
    ## point, that point will be dropped later if it is too far in the tails.
    ll         <- apply(fwx, 2, function(x){ sum(.counts * log(x)) })
    .inds      <- which.max(ll)
    .probs     <- 1
    .eligible  <- setdiff(1:ngrid, .inds)
    ll.current <- ll[.inds]

    .history      <- list()
    .history[[1]] <- data.frame(x = grid[.inds], csem = grid.csem[.inds], p = 1,
                                ll = ll.current, ex = grid[.inds], varx = 0)

    ## now add grid points per the algorithm until there is no improvement
    done <- FALSE
    while(!done){
        ## evaluate each eligible point with a weight "lambda"
        part.static  <- as.vector(fwx[,.inds,drop=FALSE] %*% ((1 - lambda)*.probs))
        part.total   <- matrix(part.static, ncol=length(.eligible), nrow=nW) + (fwx[,.eligible] * lambda)
        ll.candidate <- apply(part.total, 2, function(x){ sum(.counts * log(x)) })

        if(all(ll.candidate - ll.current < lltol)){
            done <- TRUE
        } else {
            w         <- which.max(ll.candidate - ll.current)
            .inds     <- c(.inds, .eligible[w])
            .eligible <- setdiff(.eligible, .eligible[w])

            ## set starting value: mass 0.05 at the new point, normalized p on the existing points
            o <- optim(p_to_theta(c(0.95*(.probs/sum(.probs)), 0.05)), negll, method = "BFGS",
                       control = list(maxit=1000, trace=5*(1 - as.numeric(quietly)), REPORT = 1))
            .probs     <- theta_to_p(o$par)
            ll.current <- -o$value

            ## sometimes an early grid point evolves to have virtually no probability; dump it
            w <- which(.probs < psmall)
            if(length(w) > 0){
                .probs <- .probs[-w]
                .probs <- .probs/sum(.probs)
                .inds  <- .inds[-w]
            }

            ## reorder the grid if need be
            o      <- order(.inds)
            .inds  <- .inds[o]
            .probs <- .probs[o]

            ## summarize the state
            .history[[length(.history)+1]] <- data.frame(x = grid[.inds], csem = grid.csem[.inds],
                                                         p = .probs, ll = ll.current,
                                                         ex = sum(grid[.inds] * .probs),
                                                         varx = sum(grid[.inds]^2 * .probs) - (sum(grid[.inds] * .probs))^2)
            if(!quietly){
                print(.history[[length(.history)]])
                cat("\n\n\n")
            }
        }
    }

    return(list(.history = .history, px = .history[[length(.history)]]))
}

expit <- function(x){
    v <- exp(x)
    v/(1+v)
}

################################################
## initial work on CSEM function from real data
################################################

## define the CSEM function corresponding to Figure 1:
case <- data.frame(score = c(232,236,239,242,245,248,251,253,256,258,260,263,265,267,269,271,273,275,
                             277,279,281,283,285,286,288,290,292,294,296,298,300,302,304,306,309,311,
                             313,316,318,321,324,327,330,333,337,341,350,351,357,365,376,393,423),
                   csem  = c(10,9,9,9,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
                             7,8,8,8,8,8,9,9,9,10,10,12,12,13,15,18,25,45))
case$cvar <- case$csem^2

## interpolate the variance function, going flat outside the range of the data
varfunc <- approxfun(x = case$score, y = case$cvar, rule=2)
## check:
## x <- seq(from=0.8*min(case$score), to = 1.2*max(case$score), length=1000)
## plot(x, varfunc(x), type="l")
## points(case$score, case$cvar, col="blue")
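
## a hedged usage sketch for npmle_deconv on simulated scores (the real data
## cannot be posted); the sample size and moments here are illustrative
## assumptions only, not values from the paper:
if(FALSE){
    xsim <- rnorm(2000, mean = 300, sd = 30)              # hypothetical true scores
    wsim <- xsim + rnorm(2000, sd = sqrt(varfunc(xsim)))  # add heteroskedastic error
    pxdemo <- npmle_deconv(round(wsim, digits=1),
                           csem = function(x){ sqrt(varfunc(x)) },
                           xmin = min(wsim) - 5, xmax = max(wsim) + 5,
                           quietly = TRUE)$px
    ## pxdemo$ex and pxdemo$varx approximate the mean and variance of X
}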
## using raw achievement data (which cannot be posted) and deconvolution methods,
## we estimated that X in the population had mean 334.3 and standard deviation 32.8:
mux <- 334.3
sdx <- 32.8

################################################
## use Monte Carlo methods to approximate A(W)
################################################
xgrid <- seq(from=min(case$score), to=max(case$score), length=200)
nreps <- 30000

d        <- data.frame(x=rep(xgrid, each=nreps))
d$target <- varfunc(d$x)
d$w      <- d$x + rnorm(nrow(d), mean=0, sd = sqrt(d$target))

s <- bs(d$w, df=10, intercept=TRUE)
colnames(s) <- paste("phi",colnames(s),sep="")

b <- aggregate(s, by=list(x=d$x), FUN=mean)
b <- merge(b, unique(d[,c("x","target")]), all.x=TRUE)

mod   <- lm(as.formula(paste("target ~", paste(colnames(s), collapse=" + "), "-1")), data=b)
d$fit <- s %*% coef(mod)

A1 <- approxfun(x = d$w, y = d$fit, rule=2)

################################################
## SIMEX simulation
################################################
n    <- 5000
nsim <- 500
B    <- 250

lambda  <- seq(from=0, to=2, length=20)
nlambda <- length(lambda)
nparam  <- 1

bigres <- vector(nsim, mode="list")

for(simi in 1:nsim){

    ## generate x
    d <- data.frame(x1 = rnorm(n, mean = mux, sd = sdx))
    d$x1std <- (d$x1 - mux) / sdx

    ## generate the response
    d$ps <- expit(0.0 - 1.5*d$x1std)
    d$R  <- as.numeric(runif(n) < d$ps)

    ## generate W according to varfunc
    g1   <- varfunc
    d$w1 <- d$x1 + rnorm(n, sd = sqrt(g1(d$x1)))

    ## get the deconvolution estimate of the distribution of X.
    ## NOTE: we use rounded W to speed up the computations
    d$w1round <- round(d$w1, digits=1)
    pxhat <- npmle_deconv(d$w1round, csem = function(x){ sqrt(g1(x)) },
                          xmin=min(d$w1round), xmax=max(d$w1round), quietly=TRUE)$px

    ## get the pieces we need for the various SIMEX estimators.
    ## estimated average error variance E[g(X)] = var(W) - var(X):
    d$egx <- var(d$w1) - unique(pxhat$varx)

    ## conditional mean of X given W (so we can calculate g(E[X|W]))
    f <- function(w){
        pxw <- dnorm(w, mean = pxhat$x, sd = sqrt(g1(pxhat$x))) * pxhat$p
        pxw <- pxw / sum(pxw)
        return( sum(pxhat$x * pxw) )
    }
    d$exgivenw <- sapply(d$w1, f)

    ## E[g(X) | W]
    f <- function(w){
        pxw <- dnorm(w, mean = pxhat$x, sd = sqrt(g1(pxhat$x))) * pxhat$p
        pxw <- pxw / sum(pxw)
        return( sum(g1(pxhat$x) * pxw) )
    }
    d$egxgivenw <- sapply(d$w1, f)

    ## function to get the logistic regression coefficient for the response,
    ## standardizing whichever covariate is passed in
    esty <- function(whatcov){
        stopifnot(whatcov %in% names(d))
        d$reg.x1 <- (d[,whatcov] - mux) / sdx
        coef(glm(R ~ reg.x1, data=d, family="binomial"))[2]
    }

    ###########################################################################
    ## infeasible SIMEX using the unobserved true variance for each case
    ###########################################################################
    rL.gx <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * g1(d$x1)))
            tmp[[.b]] <- esty("w1star")
        }
        rL.gx[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.gx <- as.data.frame(do.call("rbind", rL.gx))

    ###########################################################################
    ## SIMEX using the estimated average ME variance, ignoring heteroskedasticity
    ###########################################################################
    rL.avggx <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * d$egx))
            tmp[[.b]] <- esty("w1star")
        }
        rL.avggx[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.avggx <- as.data.frame(do.call("rbind", rL.avggx))
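
    ## NOTE: the remaining SIMEX variants below repeat this same double loop,
    ## differing only in the plug-in used for the measurement error variance.
    ## a hedged generic sketch of the shared pattern (hypothetical helper, not
    ## used here; inlines the regression since esty() reads the loop-level d):
    ##   simex_curve <- function(v){   # v: scalar or length-n variance plug-in
    ##       sapply(lambda, function(l){
    ##           mean(sapply(1:B, function(b){
    ##               d$w1star <- d$w1 + rnorm(n, sd = sqrt(l * v))  # local copy of d
    ##               d$reg.x1 <- (d$w1star - mux) / sdx
    ##               coef(glm(R ~ reg.x1, data=d, family="binomial"))[2]
    ##           }))
    ##       })
    ##   }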
    ###########################################################################
    ## SIMEX using a naive estimate of the average ME variance
    ###########################################################################
    rL.avggw <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * mean(g1(d$w1))))
            tmp[[.b]] <- esty("w1star")
        }
        rL.avggw[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.avggw <- as.data.frame(do.call("rbind", rL.avggw))

    ###########################################################################
    ## SIMEX using the g1(w) plug-in
    ###########################################################################
    rL.gw <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * g1(d$w1)))
            tmp[[.b]] <- esty("w1star")
        }
        rL.gw[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.gw <- as.data.frame(do.call("rbind", rL.gw))

    ###########################################################################
    ## SIMEX using g1(E[X|W])
    ###########################################################################
    rL.gexw <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * g1(d$exgivenw)))
            tmp[[.b]] <- esty("w1star")
        }
        rL.gexw[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.gexw <- as.data.frame(do.call("rbind", rL.gexw))

    ###########################################################################
    ## SIMEX using A(W)
    ###########################################################################
    rL.aw <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * A1(d$w1)))
            tmp[[.b]] <- esty("w1star")
        }
        rL.aw[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.aw <- as.data.frame(do.call("rbind", rL.aw))

    ###########################################################################
    ## SIMEX using E[g(X)|W]
    ###########################################################################
    rL.rcal <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * d$egxgivenw))
            tmp[[.b]] <- esty("w1star")
        }
        rL.rcal[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.rcal <- as.data.frame(do.call("rbind", rL.rcal))

    ############################################################################
    ## collect estimates
    ############################################################################
    res.ideal <- data.frame(method = "ideal", b1 = esty("x1"), stringsAsFactors=FALSE)
    names(res.ideal) <- c("method","b1")

    res.naive <- data.frame(method = "naive", b1 = esty("w1"), stringsAsFactors=FALSE)
    names(res.naive) <- c("method","b1")

    res.regcal <- data.frame(method = "regcalib", b1 = esty("exgivenw"), stringsAsFactors=FALSE)
    names(res.regcal) <- c("method","b1")

    proj <- list(rL.gx = rL.gx, rL.avggx = rL.avggx, rL.avggw = rL.avggw, rL.gw = rL.gw,
                 rL.gexw = rL.gexw, rL.aw = rL.aw, rL.rcal = rL.rcal)

    ## quadratic projections
    res.rest <- do.call("rbind", lapply(proj, function(x){
        e <- rep(0,nparam)
        for(i in 1:length(e)){
            ex    <- data.frame(lam = lambda, e = x[,i])
            synth <- data.frame(lam = seq(from=-1, to=2, length=1000))
            mod2  <- lm(e ~ lam + I(lam^2), data=ex)
            synth$ehat <- predict(mod2, newdata=synth)
            e[i]  <- synth$ehat[1]
        }
        e
    }))
    res.rest <- as.data.frame(res.rest)
    names(res.rest) <- c("b1")
    res.rest$method <- paste("simex2", sapply(strsplit(rownames(res.rest),"\\."), function(x){ x[2] }), sep="_")
    res.rest <- res.rest[,c("method","b1")]
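
    ## NOTE on the extrapolation step: synth$lam runs from -1 to 2, so synth$ehat[1]
    ## is the fitted value at lambda = -1, the standard SIMEX extrapolant where the
    ## net error variance (1 + lambda) * g(X) of W* would vanish. a standalone sketch
    ## of the same step (hypothetical helper, not used here):
    ##   simex_extrap <- function(lam, est, degree = 2){
    ##       fit <- lm(est ~ poly(lam, degree, raw = TRUE))
    ##       predict(fit, newdata = data.frame(lam = -1))
    ##   }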
    ## quartic projections
    res.rest4 <- do.call("rbind", lapply(proj, function(x){
        e <- rep(0,nparam)
        for(i in 1:length(e)){
            ex    <- data.frame(lam = lambda, e = x[,i])
            synth <- data.frame(lam = seq(from=-1, to=2, length=1000))
            mod4  <- lm(e ~ lam + I(lam^2) + I(lam^3) + I(lam^4), data=ex)
            synth$ehat <- predict(mod4, newdata=synth)
            e[i]  <- synth$ehat[1]
        }
        e
    }))
    res.rest4 <- as.data.frame(res.rest4)
    names(res.rest4) <- c("b1")
    res.rest4$method <- paste("simex4", sapply(strsplit(rownames(res.rest4),"\\."), function(x){ x[2] }), sep="_")
    res.rest4 <- res.rest4[,c("method","b1")]

    ## put the pieces together
    res <- rbind(res.ideal, res.naive, res.regcal, res.rest, res.rest4)
    res$sim <- simi
    bigres[[simi]] <- list(res = res, projections = proj)
    cat(paste("simi:",simi,"\n"))
}

save(bigres, file=objfile, compress=TRUE)

## now restrict to the per-simulation summaries
bigres <- do.call("rbind", lapply(bigres, function(x){ x$res }))

## summaries: mean and SD of each estimate across simulations, by method
for(v in setdiff(names(bigres), c("method","sim"))){
    bigres[,paste(v,"mean",sep="_")] <- ave(bigres[,v], bigres$method, FUN=mean)
    bigres[,paste(v,"sd",sep="_")]   <- ave(bigres[,v], bigres$method, FUN=sd)
}
print(sums <- unique(bigres[,c("method",names(bigres)[grep("_",names(bigres))])]), digits=3)
## the true value is -1.5
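
## optional post-hoc check (a convenience sketch, not part of the original
## output): bias of each method's mean estimate relative to the true value -1.5
if(FALSE){
    sums$bias <- sums$b1_mean + 1.5
    print(sums[order(abs(sums$bias)), ], digits=3)
}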