################################################################################
## code to reproduce Table 4
## THIS FILE: SIMEX simulations for n=5000
################################################################################

library(splines)

set.seed(4040)
options(width=150)
sessionInfo()

objfile <- "~/simex-vfunc/code-files-for-psychometrika-paper-aug2016/code-for-table4-n5000.Robj"

################################################
## function for deconvolution via NPMLE
################################################
npmle_deconv <- function(W, csem, xmin = -5, xmax = +5, ngrid = 2000, lambda = 0.0005,
                         lltol = 1e-7, psmall = 0.00005, discrete = FALSE, quietly = FALSE){
    ## nonparametric MLE of the distribution of the error-free variable "X" given the
    ## error-prone variable "W", where W = X + U and U|X ~ N(0, csem^2(X))
    ##
    ## uses the algorithm described in Rabe-Hesketh et al (2003). "Correcting for
    ## covariate measurement error in logistic regression using nonparametric
    ## maximum likelihood estimation." Statistical Modelling 3: 215-232.
    ##
    ## INPUTS:
    ## W:      vector of observed scores
    ## csem:   function of a single variable x that returns the conditional standard
    ##         deviation of W|X=x. needs to accept vector input and return vector output
    ## xmin, xmax, ngrid: define the grid over which the X distribution is calculated.
    ##         NOTE: the default (xmin,xmax) is appropriate only for a standardized score;
    ##         it will need to be changed for typical test-score reporting scales
    ## lambda: step in the RH algorithm, on the probability scale - this seemed like a
    ##         good value based on simulations
    ## lltol:  when we cannot find a step that improves the likelihood by more than lltol, we stop
    ## psmall: if a point evolves to have mass less than psmall, we drop it
    ## discrete: TODO for a future version
    ## quietly:  if FALSE, prints iteration-by-iteration output
    ##
    ## VALUE:
    ## list(.history = history of probability distributions,
    ##      px       = data frame containing summary info on the final distribution)
    stopifnot( all(!is.na(W)) && is.function(csem) && (xmin < xmax) && (lambda > 0) && (lltol > 0) )

    if(discrete){
        stop("discrete option to be implemented in future version")
    }

    if(min(W) < xmin){
        stop("xmin exceeds smallest data value; consider decreasing xmin")
    }

    if(max(W) > xmax){
        stop("largest data value exceeds xmax; consider increasing xmax")
    }

    ## NOTE: we collapse over repeated W values when possible, using "counts" in the likelihood
    .W      <- sort(unique(W))
    nW      <- length(.W)
    .counts <- sapply(.W, function(x){ sum(W == x) })
    stopifnot(sum(.counts) == length(W))
    W <- .W

    ## functions to map probabilities to a reduced-dimension unconstrained scale, and back
    theta_to_p <- function(theta){
        if(length(theta) == 0){
            return(1)
        } else {
            p <- exp(theta) / (1 + sum(exp(theta)))
            return(c(p, 1-sum(p)))
        }
    }

    p_to_theta <- function(p){
        stopifnot( all(p > 0) && (abs((1 - sum(p))) < 1e-9) )
        if(length(p) == 1){
            return(numeric(0))
        } else {
            last <- length(p)
            return(log(p[1:(last-1)] * (1 + ((1-p[last])/p[last])) ))
        }
    }

    ## CHECKS (with ma <- function(x){ max(abs(x)) }):
    ## p <- c(0.1, 0.2, 0.4, 0.3); print(ma(p - theta_to_p(p_to_theta(p))))
    ## theta <- c(-3.2, 1.0, -1.0); print(ma(theta - p_to_theta(theta_to_p(theta))))
    ## print(ma(1 - theta_to_p(p_to_theta(1))))
    ## p_to_theta(theta_to_p(numeric(0)))

    ## build (nW x ngrid) matrix of conditional densities if discrete == FALSE,
    ## or conditional probabilities if discrete == TRUE
    grid      <- seq(from=xmin, to=xmax, length=ngrid)
    grid.csem <- csem(grid)

    if(!discrete){
        fwx <- dnorm( matrix(W, ncol=ngrid, nrow=nW),
                     mean=matrix(grid,      ncol=ngrid, nrow=nW, byrow=T),
                     sd  =matrix(grid.csem, ncol=ngrid, nrow=nW, byrow=T))
    } else {
        ## uses pxgu (not defined in this file; this branch is unreachable given the
        ## stop() above) - fwx[i,j] = p(W=W[i] | X = grid[j])
        tmp <- lapply(grid, function(u){ pxgu(u, csem(u), .W) })
        stopifnot(all(sapply(tmp, function(x){ all(x[,1] == .W) })))
        fwx <- do.call("cbind", lapply(tmp, function(x){ x[,2] }))
    }

    ## negative log likelihood for a set of probabilities given ".inds", which are
    ## indices of "grid" and are continually updated.
    ## NOTE: updated to use counts
    negll <- function(theta){
        -sum(.counts * log(as.vector(fwx[,.inds] %*% theta_to_p(theta))))
    }

    ## find the initial best grid point. NOTE this will often pick the X with the
    ## biggest CSEM, because a single point is inconsistent with most W unless the
    ## CSEM is large. the exception is if we pick ngrid to be very large; then it
    ## will find an interior point. however, even if it picks an extreme starting
    ## point, that point will be dropped later if it is too far in the tails.
    ll         <- apply(fwx, 2, function(x){ sum(.counts * log(x)) })
    .inds      <- which.max(ll)
    .probs     <- 1
    .eligible  <- setdiff(1:ngrid, .inds)
    ll.current <- ll[.inds]

    .history      <- list()
    .history[[1]] <- data.frame(x = grid[.inds], csem = grid.csem[.inds], p = 1,
                                ll = ll.current, ex = grid[.inds], varx = 0)

    ## now add grid points per the algorithm until there is no improvement
    done <- FALSE
    while(!done){
        ## evaluate each eligible point with a weight "lambda"
        part.static  <- as.vector(fwx[,.inds,drop=FALSE] %*% ((1 - lambda)*.probs))
        part.total   <- matrix(part.static, ncol=length(.eligible), nrow=nW) + (fwx[,.eligible] * lambda)
        ll.candidate <- apply(part.total, 2, function(x){ sum(.counts * log(x)) })

        if(all(ll.candidate - ll.current < lltol)){
            done <- TRUE
        } else {
            w         <- which.max(ll.candidate - ll.current)
            .inds     <- c(.inds, .eligible[w])
            .eligible <- setdiff(.eligible, .eligible[w])

            ## set starting value: mass 0.05 at the new point, normalized p on the existing points
            o <- optim(p_to_theta(c(0.95*(.probs/sum(.probs)), 0.05)), negll, method = "BFGS",
                       control = list(maxit=1000, trace=5*(1 - as.numeric(quietly)), REPORT = 1))
            .probs     <- theta_to_p(o$par)
            ll.current <- -o$value

            ## sometimes an early grid point evolves to have virtually no probability; dump it
            w <- which(.probs < psmall)
            if(length(w) > 0){
                .probs <- .probs[-w]
                .probs <- .probs/sum(.probs)
                .inds  <- .inds[-w]
            }

            ## reorder the grid if need be
            o      <- order(.inds)
            .inds  <- .inds[o]
            .probs <- .probs[o]

            ## summarize the state
            .history[[length(.history)+1]] <- data.frame(x = grid[.inds], csem = grid.csem[.inds],
                                                         p = .probs, ll = ll.current,
                                                         ex = sum(grid[.inds] * .probs),
                                                         varx = sum(grid[.inds]^2 * .probs) - (sum(grid[.inds] * .probs))^2)
            if(!quietly){
                print(.history[[length(.history)]])
                cat("\n\n\n")
            }
        }
    }

    return(list(.history = .history, px = .history[[length(.history)]]))
}

expit <- function(x){
    v <- exp(x)
    v/(1+v)
}

################################################
## initial work on CSEM function from real data
################################################

## define the CSEM function corresponding to Figure 1:
case <- data.frame(score = c(232,236,239,242,245,248,251,253,256,258,260,263,265,267,269,271,273,275,
                             277,279,281,283,285,286,288,290,292,294,296,298,300,302,304,306,309,311,
                             313,316,318,321,324,327,330,333,337,341,350,351,357,365,376,393,423),
                   csem  = c(10,9,9,9,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
                             7,8,8,8,8,8,9,9,9,10,10,12,12,13,15,18,25,45))
case$cvar <- case$csem^2

## interpolate the variance function, going flat outside the range of the data
varfunc <- approxfun(x = case$score, y = case$cvar, rule=2)
## check:
## x <- seq(from=0.8*min(case$score), to = 1.2*max(case$score), length=1000)
## plot(x, varfunc(x), type="l")
## points(case$score, case$cvar, col="blue")
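
## a hedged usage sketch for npmle_deconv on simulated scores (the real data
## cannot be posted); the sample size and moments here are illustrative
## assumptions only, not values from the paper:
if(FALSE){
    xsim <- rnorm(2000, mean = 300, sd = 30)              # hypothetical true scores
    wsim <- xsim + rnorm(2000, sd = sqrt(varfunc(xsim)))  # add heteroskedastic error
    pxdemo <- npmle_deconv(round(wsim, digits=1),
                           csem = function(x){ sqrt(varfunc(x)) },
                           xmin = min(wsim) - 5, xmax = max(wsim) + 5,
                           quietly = TRUE)$px
    ## pxdemo$ex and pxdemo$varx approximate the mean and variance of X
}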
## using raw achievement data (which cannot be posted) and deconvolution methods,
## we estimated that X in the population had mean 334.3 and standard deviation 32.8:
mux <- 334.3
sdx <- 32.8

################################################
## use Monte Carlo methods to approximate A(W)
################################################
xgrid <- seq(from=min(case$score), to=max(case$score), length=200)
nreps <- 30000

d        <- data.frame(x=rep(xgrid, each=nreps))
d$target <- varfunc(d$x)
d$w      <- d$x + rnorm(nrow(d), mean=0, sd = sqrt(d$target))

s <- bs(d$w, df=10, intercept=TRUE)
colnames(s) <- paste("phi",colnames(s),sep="")

b <- aggregate(s, by=list(x=d$x), FUN=mean)
b <- merge(b, unique(d[,c("x","target")]), all.x=TRUE)

mod   <- lm(as.formula(paste("target ~", paste(colnames(s), collapse=" + "), "-1")), data=b)
d$fit <- s %*% coef(mod)

A1 <- approxfun(x = d$w, y = d$fit, rule=2)

################################################
## SIMEX simulation
################################################
n    <- 5000
nsim <- 500
B    <- 250

lambda  <- seq(from=0, to=2, length=20)
nlambda <- length(lambda)
nparam  <- 1

bigres <- vector(nsim, mode="list")

for(simi in 1:nsim){

    ## generate x
    d <- data.frame(x1 = rnorm(n, mean = mux, sd = sdx))
    d$x1std <- (d$x1 - mux) / sdx

    ## generate the response
    d$ps <- expit(0.0 - 1.5*d$x1std)
    d$R  <- as.numeric(runif(n) < d$ps)

    ## generate W according to varfunc
    g1   <- varfunc
    d$w1 <- d$x1 + rnorm(n, sd = sqrt(g1(d$x1)))

    ## get the deconvolution estimate of the distribution of X.
    ## NOTE: we use rounded W to speed up the computations
    d$w1round <- round(d$w1, digits=1)
    pxhat <- npmle_deconv(d$w1round, csem = function(x){ sqrt(g1(x)) },
                          xmin=min(d$w1round), xmax=max(d$w1round), quietly=TRUE)$px

    ## get the pieces we need for the various SIMEX estimators.
    ## estimated average error variance E[g(X)] = var(W) - var(X):
    d$egx <- var(d$w1) - unique(pxhat$varx)

    ## conditional mean of X given W (so we can calculate g(E[X|W]))
    f <- function(w){
        pxw <- dnorm(w, mean = pxhat$x, sd = sqrt(g1(pxhat$x))) * pxhat$p
        pxw <- pxw / sum(pxw)
        return( sum(pxhat$x * pxw) )
    }
    d$exgivenw <- sapply(d$w1, f)

    ## E[g(X) | W]
    f <- function(w){
        pxw <- dnorm(w, mean = pxhat$x, sd = sqrt(g1(pxhat$x))) * pxhat$p
        pxw <- pxw / sum(pxw)
        return( sum(g1(pxhat$x) * pxw) )
    }
    d$egxgivenw <- sapply(d$w1, f)

    ## function to get the logistic regression coefficient for the response,
    ## standardizing whichever covariate is passed in
    esty <- function(whatcov){
        stopifnot(whatcov %in% names(d))
        d$reg.x1 <- (d[,whatcov] - mux) / sdx
        coef(glm(R ~ reg.x1, data=d, family="binomial"))[2]
    }

    ###########################################################################
    ## infeasible SIMEX using the unobserved true variance for each case
    ###########################################################################
    rL.gx <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * g1(d$x1)))
            tmp[[.b]] <- esty("w1star")
        }
        rL.gx[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.gx <- as.data.frame(do.call("rbind", rL.gx))

    ###########################################################################
    ## SIMEX using the estimated average ME variance, ignoring heteroskedasticity
    ###########################################################################
    rL.avggx <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * d$egx))
            tmp[[.b]] <- esty("w1star")
        }
        rL.avggx[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.avggx <- as.data.frame(do.call("rbind", rL.avggx))
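
    ## NOTE: the remaining SIMEX variants below repeat this same double loop,
    ## differing only in the plug-in used for the measurement error variance.
    ## a hedged generic sketch of the shared pattern (hypothetical helper, not
    ## used here; inlines the regression since esty() reads the loop-level d):
    ##   simex_curve <- function(v){   # v: scalar or length-n variance plug-in
    ##       sapply(lambda, function(l){
    ##           mean(sapply(1:B, function(b){
    ##               d$w1star <- d$w1 + rnorm(n, sd = sqrt(l * v))  # local copy of d
    ##               d$reg.x1 <- (d$w1star - mux) / sdx
    ##               coef(glm(R ~ reg.x1, data=d, family="binomial"))[2]
    ##           }))
    ##       })
    ##   }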
    ###########################################################################
    ## SIMEX using a naive estimate of the average ME variance
    ###########################################################################
    rL.avggw <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * mean(g1(d$w1))))
            tmp[[.b]] <- esty("w1star")
        }
        rL.avggw[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.avggw <- as.data.frame(do.call("rbind", rL.avggw))

    ###########################################################################
    ## SIMEX using the g1(w) plug-in
    ###########################################################################
    rL.gw <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * g1(d$w1)))
            tmp[[.b]] <- esty("w1star")
        }
        rL.gw[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.gw <- as.data.frame(do.call("rbind", rL.gw))

    ###########################################################################
    ## SIMEX using g1(E[X|W])
    ###########################################################################
    rL.gexw <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * g1(d$exgivenw)))
            tmp[[.b]] <- esty("w1star")
        }
        rL.gexw[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.gexw <- as.data.frame(do.call("rbind", rL.gexw))

    ###########################################################################
    ## SIMEX using A(W)
    ###########################################################################
    rL.aw <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * A1(d$w1)))
            tmp[[.b]] <- esty("w1star")
        }
        rL.aw[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.aw <- as.data.frame(do.call("rbind", rL.aw))

    ###########################################################################
    ## SIMEX using E[g(X)|W]
    ###########################################################################
    rL.rcal <- vector(nlambda, mode="list")
    for(.l in 1:nlambda){
        tmp <- vector(B, mode="list")
        for(.b in 1:B){
            d$w1star  <- d$w1 + rnorm(n, sd = sqrt(lambda[.l] * d$egxgivenw))
            tmp[[.b]] <- esty("w1star")
        }
        rL.rcal[[.l]] <- apply(do.call("rbind", tmp), 2, mean)
    }
    rL.rcal <- as.data.frame(do.call("rbind", rL.rcal))

    ############################################################################
    ## collect estimates
    ############################################################################
    res.ideal <- data.frame(method = "ideal", b1 = esty("x1"), stringsAsFactors=FALSE)
    names(res.ideal) <- c("method","b1")

    res.naive <- data.frame(method = "naive", b1 = esty("w1"), stringsAsFactors=FALSE)
    names(res.naive) <- c("method","b1")

    res.regcal <- data.frame(method = "regcalib", b1 = esty("exgivenw"), stringsAsFactors=FALSE)
    names(res.regcal) <- c("method","b1")

    proj <- list(rL.gx = rL.gx, rL.avggx = rL.avggx, rL.avggw = rL.avggw, rL.gw = rL.gw,
                 rL.gexw = rL.gexw, rL.aw = rL.aw, rL.rcal = rL.rcal)

    ## quadratic projections
    res.rest <- do.call("rbind", lapply(proj, function(x){
        e <- rep(0,nparam)
        for(i in 1:length(e)){
            ex    <- data.frame(lam = lambda, e = x[,i])
            synth <- data.frame(lam = seq(from=-1, to=2, length=1000))
            mod2  <- lm(e ~ lam + I(lam^2), data=ex)
            synth$ehat <- predict(mod2, newdata=synth)
            e[i]  <- synth$ehat[1]
        }
        e
    }))
    res.rest <- as.data.frame(res.rest)
    names(res.rest) <- c("b1")
    res.rest$method <- paste("simex2", sapply(strsplit(rownames(res.rest),"\\."), function(x){ x[2] }), sep="_")
    res.rest <- res.rest[,c("method","b1")]
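
    ## NOTE on the extrapolation step: synth$lam runs from -1 to 2, so synth$ehat[1]
    ## is the fitted value at lambda = -1, the standard SIMEX extrapolant where the
    ## net error variance (1 + lambda) * g(X) of W* would vanish. a standalone sketch
    ## of the same step (hypothetical helper, not used here):
    ##   simex_extrap <- function(lam, est, degree = 2){
    ##       fit <- lm(est ~ poly(lam, degree, raw = TRUE))
    ##       predict(fit, newdata = data.frame(lam = -1))
    ##   }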
    ## quartic projections
    res.rest4 <- do.call("rbind", lapply(proj, function(x){
        e <- rep(0,nparam)
        for(i in 1:length(e)){
            ex    <- data.frame(lam = lambda, e = x[,i])
            synth <- data.frame(lam = seq(from=-1, to=2, length=1000))
            mod4  <- lm(e ~ lam + I(lam^2) + I(lam^3) + I(lam^4), data=ex)
            synth$ehat <- predict(mod4, newdata=synth)
            e[i]  <- synth$ehat[1]
        }
        e
    }))
    res.rest4 <- as.data.frame(res.rest4)
    names(res.rest4) <- c("b1")
    res.rest4$method <- paste("simex4", sapply(strsplit(rownames(res.rest4),"\\."), function(x){ x[2] }), sep="_")
    res.rest4 <- res.rest4[,c("method","b1")]

    ## put the pieces together
    res <- rbind(res.ideal, res.naive, res.regcal, res.rest, res.rest4)
    res$sim <- simi
    bigres[[simi]] <- list(res = res, projections = proj)
    cat(paste("simi:",simi,"\n"))
}

save(bigres, file=objfile, compress=TRUE)

## now restrict to the per-simulation summaries
bigres <- do.call("rbind", lapply(bigres, function(x){ x$res }))

## summaries: mean and SD of each estimate across simulations, by method
for(v in setdiff(names(bigres), c("method","sim"))){
    bigres[,paste(v,"mean",sep="_")] <- ave(bigres[,v], bigres$method, FUN=mean)
    bigres[,paste(v,"sd",sep="_")]   <- ave(bigres[,v], bigres$method, FUN=sd)
}
print(sums <- unique(bigres[,c("method",names(bigres)[grep("_",names(bigres))])]), digits=3)
## the true value is -1.5
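
## optional post-hoc check (a convenience sketch, not part of the original
## output): bias of each method's mean estimate relative to the true value -1.5
if(FALSE){
    sums$bias <- sums$b1_mean + 1.5
    print(sums[order(abs(sums$bias)), ], digits=3)
}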