# Authors: Kristin Porter and Susan Gruber # Date: September 25, 2010 # This program can be used to generate data according to the data-generating # distributions described in Petersen, ML, Porter, KE, Gruber S, Wang Y, and # van der Laan, MJ.(2010)."Diagnosing and Responding to Violations of the # Positivity Assumption," Statistical Methods in Medical Research (in press), # and a technical report available at http://www.bepress.com/ucbbiostat/paper269. # The simulation described in the paper corresponds to Simulation 2 in the tech # report, and Simulation 2 below. # # Functions needed for the analysis are in "pboot.R," available at # http://www.stat.berkeley.edu/users/laan/Software/ #---------------------------------------------------------------------------- # DATA GENERATION #---------------------------------------------------------------------------- # Simulation 1: Generate a sample of size n, (n=1000 by default) # W = W1,W2, bivariate normal, mu1=0.5, mu2=1, sigma = (2,1,1,1) # A = PHI(0.3*(0.5 + 0.25W1 + 0.75W2)), (probit model) # Y = 1 + A + W1 + 2W2 + epsilon, epsilon~N(0,1) # psi0 = 1 generate_sim1 <- function(n=1000){ sigma <- matrix(c(2, 1, 1, 1), ncol=2) W <- matrix(rnorm(n*2), ncol = nrow(sigma)) %*% chol(sigma) W <- W + matrix(rep(c(.5, 1),each=n), byrow=FALSE, ncol=2) colnames(W) <- c("W1", "W2") A <- as.integer(0.2*(0.5 + 0.25*W[,"W1"] + 0.75*W[,"W2"]) + rnorm(n) > 0) Y <- 1 + A + W[,"W1"] + 2*W[,"W2"] + rnorm(n) return(data.frame(Y,A,W)) } #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # Simulation 2: Generate a sample of size n (n=1000 by default) # W = W1,W2, bivariate normal, mu1=0.5, mu2=1, sigma = (2,1,1,1) # A = PHI(0.5 + 0.25W1 + 0.75W2), (probit model) # Y = 1 + A + W1 + 2W2 + epsilon, epsilon~N(0,1) # psi0 = 1 generate_sim2 <- function(n=1000){ sigma <- matrix(c(2, 1, 1, 1), ncol=2) W <- matrix(rnorm(n*2), ncol = nrow(sigma)) %*% chol(sigma) W <- W + matrix(rep(c(.5, 1),each=n), byrow=FALSE, ncol=2) colnames(W) <- c("W1", "W2") A <- as.integer(0.5 + 0.25*W[,"W1"] + 0.75*W[,"W2"] + rnorm(n) > 0) Y <- 1 + A + W[,"W1"] + 2*W[,"W2"] + rnorm(n) return(data.frame(Y,A,W)) } #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # Simulation 3: Generate a sample of size n, (n=200 by default) # W1 ~ N(0.5, 1), W2 ~ Bernoulli(0.5) # A ~ Bernoulli(expit(-3 - W1 + 9W2)) # Y ~ Bernoulli(expit(-1 + 5A + W1 + 10W2)) # psi0 = 0.29 generate_sim3 <- function(n=200){ W <- cbind(rnorm(n,0.5), rbinom(n, 1, 0.5)) colnames(W) <- c("W1", "W2") A <- rbinom(n, 1, plogis(-3 - W[,"W1"] + 9*W[,"W2"])) Y <- rbinom(n,1, plogis(-1 + 5*A + W[,"W1"] + 10*W[,"W2"])) return(data.frame(Y,A,W)) } #---------------------------------------------------------------------------- # Draw 250 samples to estimate Bias,Variance, MSE # Note: Calling bias.pboot with 'pboot=FALSE' returns estimates on original sample only # This code provides results with bound on g set to (0,1). Edit the next # line to obtain results with g bounded at other levels. gbounds <- c(0,1) psi0 <- 1 Q.cor <- "Y~A+W1+W2" Q.mis <- "Y~A+W1" g.cor <- "A~W1+W2" g.mis <- "A~W1" estimators <- c("Gcomp", "IPTW", "A_IPTW", "TMLE") n.iter <- 250 #--------------------------- Simulation 1 --------------------------------- # Less extreme treatment probabilities than described in Freedman&Berk, 2008. #---------------------------------------------------------------------------- sim1.Qcgc <- sim1.Qcgm <- sim1.Qmgc <- matrix(NA, nrow=n.iter, ncol=length(estimators), dimnames=list(NULL,estimators)) set.seed(39) for (i in 1:n.iter){ d.sim1 <- generate_sim1() sim1.Qcgc[i,] <- bias.pboot(d.sim1, Qmethod="glm", Qform=Q.cor, gmethod="glm", gform=g.cor, gbounds=gbounds, pboot=FALSE)\$est sim1.Qcgm[i,] <- bias.pboot(d.sim1, Qmethod="glm", Qform=Q.cor, gmethod="glm", gform=g.mis, gbounds=gbounds, pboot=FALSE)\$est sim1.Qmgc[i,] <- bias.pboot(d.sim1, Qmethod="glm", Qform=Q.mis, gmethod="glm", gform=g.cor, gbounds=gbounds, pboot=FALSE)\$est } sim1.results <-cbind(bias.Qcgc = colMeans(sim1.Qcgc-psi0), var.Qcgc = apply(sim1.Qcgc,2, var), MSE.Qcgc = colMeans((sim1.Qcgc-psi0)^2), bias.Qcgm = colMeans(sim1.Qcgm-psi0), var.Qcgm = apply(sim1.Qcgm,2, var), MSE.Qcgm = colMeans((sim1.Qcgm-psi0)^2), bias.Qmgc = colMeans(sim1.Qmgc-psi0), var.Qmgc = apply(sim1.Qmgc,2, var), MSE.Qmgc = colMeans((sim1.Qmgc-psi0)^2)) print(round(sim1.results,3)) #--------------------------- Simulation 2 --------------------------------- # Original settings described in Freedman&Berk, 2008. #---------------------------------------------------------------------------- sim2.Qcgc <- sim2.Qcgm <- sim2.Qmgc <- matrix(NA, nrow=n.iter, ncol=length(estimators), dimnames=list(NULL,estimators)) set.seed(39) for (i in 1:n.iter){ d.sim2 <- generate_sim2() sim2.Qcgc[i,] <- bias.pboot(d.sim2, Qmethod="glm", Qform=Q.cor, gmethod="glm", gform=g.cor, gbounds=gbounds, pboot=FALSE)\$est sim2.Qcgm[i,] <- bias.pboot(d.sim2, Qmethod="glm", Qform=Q.cor, gmethod="glm", gform=g.mis, gbounds=gbounds, pboot=FALSE)\$est sim2.Qmgc[i,] <- bias.pboot(d.sim2, Qmethod="glm", Qform=Q.mis, gmethod="glm", gform=g.cor, gbounds=gbounds, pboot=FALSE)\$est } sim2.results <- cbind(bias.Qcgc = colMeans(sim2.Qcgc-psi0), var.Qcgc = apply(sim2.Qcgc,2, var), MSE.Qcgc = colMeans((sim2.Qcgc-psi0)^2), bias.Qcgm = colMeans(sim2.Qcgm-psi0), var.Qcgm = apply(sim2.Qcgm,2, var), MSE.Qcgm = colMeans((sim2.Qcgm-psi0)^2), bias.Qmgc = colMeans(sim2.Qmgc-psi0), var.Qmgc = apply(sim2.Qmgc,2, var), MSE.Qmgc = colMeans((sim2.Qmgc-psi0)^2)) print(round(sim2.results,3)) #--------------------------- Simulation 3 --------------------------------- # Estimate with both Q and g correct, and selected data-adaptively, # forcing A into the the model for Q (Qdgd1), and # forcing A and propensity score into the model that generates parametric # bootstrap data (Qdgd2) #---------------------------------------------------------------------------- sim3.Qcgc <- sim3.Qdgd1 <- sim3.Qdgd2 <- matrix(NA, nrow=n.iter, ncol=length(estimators), dimnames=list(NULL,estimators)) psi0.3 <- 0.29 set.seed(39) for (i in 1:n.iter){ d.sim3 <- generate_sim3() sim3.Qcgc[i,] <- bias.pboot(d.sim3, gfamily="binomial", Qmethod="glm", Qform=Q.cor, gmethod="glm", gform=g.cor, gbounds=gbounds, pboot=FALSE, Qfamily="binomial")\$est sim3.Qdgd1[i,] <- bias.pboot(d.sim3, gfamily="binomial", Qmethod="step1", gmethod="step", gbounds=gbounds, pboot=FALSE, Qfamily="binomial")\$est sim3.Qdgd2[i,] <- bias.pboot(d.sim3, gfamily="binomial", Qmethod="step2", gmethod="step", gbounds=gbounds, pboot=FALSE, Qfamily="binomial")\$est } sim3.results <- cbind(bias.Qcgc = colMeans(sim3.Qcgc-psi0.3), var.Qcgc = apply(sim3.Qcgc,2, var), MSE.Qcgc = colMeans((sim3.Qcgc-psi0.3)^2), bias.Qdgd1 = colMeans(sim3.Qdgd1-psi0.3), var.Qdgd1 = apply(sim3.Qdgd1,2, var), MSE.Qdgd1 = colMeans((sim3.Qdgd1-psi0.3)^2), bias.Qdgd2 = colMeans(sim3.Qdgd2-psi0.3), var.Qdgd2 = apply(sim3.Qdgd2,2, var), MSE.Qdgd2 = colMeans((sim3.Qdgd2-psi0.3)^2)) print(round(sim3.results,3)) #---------------- The Parametric Bootstrap Diagnostic ----------------------- # This section applies the parametric bootstrap to N=10 samples from # simulation 2 data-generating distribution with g bounded at (0,1) # Note: Differences betweeen results obtained below and those in the paper are # due to different samples being drawn, and different non-parametric samples being # generated from one run to the next. #---------------------------------------------------------------------------- N <- 10 sim2.pbs.Qcgc <- sim2.pbs.Qcgm <- sim2.pbs.Qmgc <- matrix(NA, nrow=N, ncol=length(estimators), dimnames = list(NULL, estimators)) for (i in 1:N){ d.sim2 <- generate_sim2() sim2.pbs.Qcgc[i,] <- bias.pboot(d.sim2, Qmethod="glm", Qform=Q.cor, gmethod="glm", gform=g.cor, gbounds=gbounds, pboot=TRUE)\$ETA.bias sim2.pbs.Qcgm[i,] <- bias.pboot(d.sim2, Qmethod="glm", Qform=Q.cor, gmethod="glm", gform=g.mis, gbounds=gbounds, pboot=TRUE)\$ETA.bias sim2.pbs.Qmgc[i,] <- bias.pboot(d.sim2, Qmethod="glm", Qform=Q.mis, gmethod="glm", gform=g.cor, gbounds=gbounds, pboot=TRUE)\$ETA.bias } sim2.pbs <- cbind(bias.Qcgc = colMeans(sim2.pbs.Qcgc), var.bias.Qcgc = apply(sim2.pbs.Qcgc,2,var), rel.bias.Qcgc = NA, bias.Qcgm = colMeans(sim2.pbs.Qcgm), var.bias.Qcgm = apply(sim2.pbs.Qcgm,2,var), rel.bias.Qcgm = NA, bias.Qmgc = colMeans(sim2.pbs.Qmgc), var.bias.Qmgc = apply(sim2.pbs.Qmgc,2,var), rel.bias.Qmgc = NA) # calculate ETA bias relative to bias of each estimator estimated above, if available if(exists("sim2.results")){ sim2.pbs[,"rel.bias.Qcgc"] <- sim2.pbs[,"bias.Qcgc"]/sim2.results[, "bias.Qcgc"] sim2.pbs[,"rel.bias.Qmgc"] <- sim2.pbs[,"bias.Qmgc"]/sim2.results[, "bias.Qmgc"] sim2.pbs[,"rel.bias.Qcgm"] <- sim2.pbs[,"bias.Qcgm"]/sim2.results[, "bias.Qcgm"] } print(round(sim2.pbs, 4)) #--------------- Non-parametric Bootstrap Variance Estimates ------------------- # Standard bootstrap approach to estimating variance, given a single dataset # find the variance of estimates on B bootstrap samples obtained by sampling # n observations with replacement from the original data # Example: simulation 2 dataset #------------------------------------------------------------------------------- B <- 1000 n <- 1000 d.sim2 <- generate_sim2(n) sim2.npbs.Qcgc <- sim2.npbs.Qcgm <- sim2.npbs.Qmgc <- matrix(NA, nrow=B, ncol=length(estimators), dimnames=list(NULL,estimators)) sim2.npbs.Qcgc <- replicate(B, bias.pboot(d.sim2[sample(1:n, replace=TRUE),], gfamily="binomial", Qmethod="glm", Qform=Q.cor, gmethod="glm", gform=g.cor, gbounds=gbounds, pboot=FALSE)\$est) sim2.npbs.Qcgm <- replicate(B, bias.pboot(d.sim2[sample(1:n, replace=TRUE),], gfamily="binomial", Qmethod="glm", Qform=Q.cor, gmethod="glm", gform=g.mis, gbounds=gbounds, pboot=FALSE)\$est) sim2.npbs.Qmgc <- replicate(B, bias.pboot(d.sim2[sample(1:n, replace=TRUE),], gfamily="binomial", Qmethod="glm", Qform=Q.mis, gmethod="glm", gform=g.cor, gbounds=gbounds, pboot=FALSE)\$est) sim2.SE.gbd025 <- cbind(SE.Qcgc = apply(sim2.npbs.Qcgc,1, sd), SE.Qcgm = apply(sim2.npbs.Qcgm,1, sd), SE.Qmgc = apply(sim2.npbs.Qmgc,1, sd)) round(sim2.SE.gbd025,3)