# Supplemental Materials for "Targeted Minimum Loss Based Estimation of an Intervention Specific Mean Outcome," Mark J. van der Laan and Susan Gruber, The International Journal of Biostatistics, 2012. # This program runs the simulations described in Section 5 of the paper. # Source code for the estimation procedure is available in file tmle_ISM.R, # at http://www.stat.berkeley.edu/users/laan/Software/ # Susan Gruber, sgruber@cal.berkeley.edu set.seed(10) #---------------------------------------------------------------------------- # DATA GENERATION FUNCTIONS #---------------------------------------------------------------------------- # Simulation 1: Generate a sample of size n # L0 -> A0 -> C0 -> L1 -> A1 -> C1 -> Y # we'll encode as A0 -> (1-C0) -> A1 -> (1-C1) -> Y # returns: # treat - dataset where A(k)=1 signifies subject is following the treatment # regimen of interest and is observed at node (k) # control - dataset where A(k)=1 signifies subject is following the control # regimen of interest and is observed at node (k) # psi0 - true difference in treatment-specific means # psi0.trt - true treatment-specific mean for treatment regimen # psi0.ctl - true treatment-specific mean for control regimen # Inodes, Lnodes, Ynodes - indices of columns in treat and control # corresponding to each type of node #---------------------------------------------------------------------------- generate_sim1 <- function(n=500){ W1 <- rbinom(n,1, .5) W2 <- rbinom(n, 1, .5) W3 <- rnorm(n, 4, 1) A0 <- rbinom(n, 1, .5) logitA.C0 <- .1 +.5*W1 + W2 - .1*(W3) A.C0 <- rbinom(n, 1, plogis(logitA.C0)) L1 <- 3 + A0 - .5*W1*W3 - .5*W3 + rnorm(n, 1) logitA1 <- (-1.2 - .2*W2 +.1* W3 + .4*L1) A1 <- rbinom(n, 1, plogis(logitA1)) A1[A0==0] <- 0 # no switching from ctl to treatment logitA.C1 <- 2 - .05*(W3) - .4*L1 A.C1 <- rbinom(n, 1, plogis(logitA.C1)) Y <- plogis(3 - .3*A0 - .5*A1 + .1*W2 - .5*L1 + rnorm(n)) treat <- data.frame(ID=1:n, W1, W2, W3, A0, A.C0, L1, A1, A.C1, Y) control <- data.frame(ID=1:n, W1, W2, W3, A0=(1-A0), A.C0, L1, A1=(1-A0)*(1-A1), A.C1, Y) # Impose censoring by setting subsequent nodes to 0 treat\$A.C0[treat\$A0==0] <- 0 treat\$A1[treat\$A.C0==0] <- 0 treat\$A.C1[treat\$A1==0] <- 0 treat\$Y[treat\$A.C1==0] <- 0 treat\$L1[treat\$A.C0==0] <- 0 control\$A.C0[control\$A0==0] <- 0 control\$A1[control\$A.C0==0] <- 0 control\$A.C1[control\$A1==0] <- 0 control\$Y[control\$A.C1==0] <- 0 control\$L1[control\$A.C0==0] <- 0 return(list(treat=treat, control=control, psi0=-0.1602113, psi0.trt=0.7214257, psi0.ctl=0.881637, Inodes=c(5,6,8,9), Lnodes=c(7,10), Ynodes=10)) } #---------------------------------------------------------------------------- # Simulation 2: Generate a sample of size n # L0 -> A0 -> C0 -> L1.1 -> L1.2 -> L1.3 -> C1 -> L2.1 -> L2.2 -> L2.3 -> C2 -> Y # encode as: # L0->A0->A.C0->L1.1->L1.2->L1.3->A.C1->L2.1->L2.2->L2.3->A.C2->Y # where A.C(t) = 1-C(t) # returns: # d - dataset where A(k)=1 signifies subject is following the treatment # regimen of interest and is observed at time (k) # psi0 - true treatment-specific survival probability at t_k=3 # (after third censoring event node) # Inodes, Lnodes, Ynodes - indices of columns in d corresponding to each node type #---------------------------------------------------------------------------- generate_sim2 <- function(n=500){ W1 <- rnorm(n,67,4) # baseline age W2 <- exp(rnorm(n,-1,sqrt(2))) # baseline PSA level W3 <- exp(rnorm(n,-2,1)) # baseline PSA velocity logitA0 <- 1 -.03*W1 + .1*W2 + .2*W3 + .01*W1*W2 A0 <- rbinom(n, 1, plogis(logitA0)) logitA.C0 <- .1 + .02*W1 + .1*W3+ .4*A0 A.C0 <- rbinom(n, 1, plogis(logitA.C0)) # p(observed) logitL1.1 <- -2.5 + .01*W1 + .1*W3 - .2*A0 L1.1 <- rbinom(n, 1, plogis(logitL1.1)) L1.2 <- W2 - W2*A0 + .05*log(W1) + .02*W3*W2 + rnorm(n,0,sqrt(.5)) L1.3 <- .02*W1*W2 + rnorm(n) logitA.C1 <- -.6 + A0 + .3*W2 + .1*L1.2 + .5*L1.3 A.C1 <- rbinom(n, 1, plogis(logitA.C1)) logitL2.1 <- -1 + .01*W1 + .1*L1.2 - .2*L1.3 - 1*A0 L2.1 <- rbinom(n, 1, plogis(logitL2.1)) L2.1[L1.1==1] <- 1 L2.2 <- L1.2 + .01*log(W1) + .02*L1.2*L1.3 + rnorm(n,0,sqrt(.5)) L2.3 <- .01*log(W1) + .2*L1.3 + rnorm(n) logitA.C2 <- 1.3 - .4*A0 - .2*L1.3 +.3*L2.2 - .2*L2.3 # .75, .999 A.C2 <- rbinom(n, 1, plogis(logitA.C2)) logitY <- -2 - .1*L1.3 + .7*L2.2 + .8*L2.3 -1*A0 Y <- rbinom(n, 1, plogis(logitY)) Y[L2.1==1] <- 1 d <- data.frame(ID=1:n, W1, W2, W3, A0, A.C0, L1.1, L1.2, L1.3, A.C1, L2.1, L2.2, L2.3, A.C2, Y) # Impose deterministic conventions # 1. If censored at time t, subsequent nodes set to 0 # 2. If event at time t - A=1 and Y=1 for all subsequent nodes (Y(t) already set) Inodes <- c(5,6,10,14) Lnodes <- c(7:9, 11:13, 15) Ynodes <- c(7,11,15) d[d\$A0==0,c(Inodes, Lnodes)] <- 0 d[d\$A.C0==0,c(Inodes[-(1:2)], Ynodes)] <- 0 d[d\$L1.1==1,c(Inodes[-(1:2)], Ynodes)] <- 1 d[d\$A.C1==0, c(Inodes[4], Ynodes[-1])] <- 0 d[d\$L2.1==1, c(Inodes[4], Ynodes[-1])] <- 1 d[d\$A.C2==0, Ynodes[3]] <- 0 d\$L1.2[d\$A.C0==0 | d\$L1.1==1] <- 0 d\$L1.3[d\$A.C0==0 | d\$L1.1==1] <- 0 d\$L2.2[d\$A.C1==0 | d\$L2.1==1] <- 0 d\$L2.3[d\$A.C1==0 | d\$L2.1==1] <- 0 return(list(d=d, Inodes=Inodes, Lnodes=Lnodes, Ynodes=Ynodes, psi0=.34801)) } #---------------------------------------------------------------------------- # MONTE CARLO SIMULATION FUNCTIONS #---------------------------------------------------------------------------- source("tmle_ISM.R") size <- c(100, 1000) niter <- 500 gbd <- 0 #---------------------------------------------------------------------------- # Simulation 1 # Results are saved to two separate files - one for each sample size # tmle_ISM_sim1.n100.Rdata and tmle_ISM_sim1.n1000.Rdata #---------------------------------------------------------------------------- # Define regression models for estimating Q and g factors formA0 <- "A0 ~ 1" formA.C0 <- "A.C0 ~ W1 + W2 + W3" formL1 <- "Q.kplus1 ~ I(W1*W3) + W3" formA1 <- "A1 ~ W2 + W3 + L1" formA1.mis <- "A1 ~ W1 + W2 + W3" formA.C1 <- "A.C1 ~ W3 + L1" formA.C1.mis <- "A.C1 ~ W1 + W2 + W3" formY <- "Q.kplus1 ~ W2 + L1" formY.base <- "Q.kplus1 ~ W1 + W2 + W3" formY.mis <- "Q.kplus1 ~ 1" gform.mis <- c(formA0, formA.C0, formA1.mis, formA.C1.mis) gform.cor <- c(formA0, formA.C0, formA1, formA.C1) gformList <- list(gform.cor=gform.cor, gform.mis=gform.mis) Qform.cor <- c(formL1, formY) Qform.mis1 <- rep(formY.base, 2) Qform.mis2 <- rep(formY.mis, 2) set.seed(10) for (n in size){ psi.treat <- psi.control <- matrix(NA, nrow=niter, ncol=16) colnames(psi.control) <- colnames(psi.treat) <- paste(c("iptw","iptw.n","Gcomp.Qc", "tmle.Qc", "Gcomp.Qm1", "tmle.Qm1","Gcomp.Qm2", "tmle.Qm2"), rep(c("gc", "gm"), each=8), sep=".") var.treat <- var.control <- matrix(NA, nrow=niter, ncol=6) for (i in 1:niter) { d <- generate_sim1(n) for(j in 1:length(gformList)){ est.trt.cor <- getEstimates(d\$treat, Inodes=d\$Inodes, Lnodes=d\$Lnodes, Ynodes=d\$Ynodes, Qform.cor, gformList[[j]], gbd) est.trt.mis1 <- getEstimates(d\$treat, Inodes=d\$Inodes, Lnodes=d\$Lnodes, Ynodes = d\$Ynodes, Qform.mis1, gformList[[j]], gbd) est.trt.mis2 <- getEstimates(d\$treat, Inodes=d\$Inodes, Lnodes=d\$Lnodes, Ynodes=d\$Ynodes, Qform.mis2, gformList[[j]], gbd) est.ctl.cor <- getEstimates(d\$control, Inodes=d\$Inodes, Lnodes=d\$Lnodes, Ynodes=d\$Ynodes, Qform.cor, gformList[[j]], gbd) est.ctl.mis1 <- getEstimates(d\$control, Inodes=d\$Inodes, Lnodes=d\$Lnodes, Ynodes=d\$Ynodes, Qform.mis1, gformList[[j]], gbd) est.ctl.mis2 <- getEstimates(d\$control, Inodes=d\$Inodes, Lnodes=d\$Lnodes, Ynodes=d\$Ynodes, Qform.mis2, gformList[[j]], gbd) psi.treat[i,((j-1)*8+1):(j*8)] <- c(est.trt.cor[1:4], est.trt.mis1[3:4], est.trt.mis2[2:3]) psi.control[i,((j-1)*8+1):(j*8)] <- c(est.ctl.cor[1:4], est.ctl.mis1[3:4], est.ctl.mis2[2:3]) var.treat[i,((j-1)*3+1):(j*3)] <- c(est.trt.cor[5], est.trt.mis1[5], est.trt.mis2[5]) var.control[i,((j-1)*3+1):(j*3)] <- c(est.ctl.cor[5], est.ctl.mis1[5], est.ctl.mis2[5]) }} psi0 <- d\$psi0 psi <- psi.treat - psi.control results <- cbind(est=colMeans(psi), bias=colMeans(psi-psi0), var=apply(psi, 2, var), MSE=colMeans((psi-psi0)^2)) var.psi <- var.treat + var.control print(results,2) save(results, psi, psi0, var.psi, psi.treat, psi.control, var.treat, var.control, file = paste("tmle_ISM_sim1.n",n,".Rdata", sep="")) } size <- c(100, 1000) niter <- 500 gbd <- 0 #---------------------------------------------------------------------------- # Simulation 2 # Results are saved to two separate files - one for each sample size # tmle_ISM_sim2.n100.Rdata and tmle_ISMsim2.n1000.Rdata #---------------------------------------------------------------------------- set.seed(10) # Define regression models for estimating Q and g factors formA0 <- "A0 ~ W1 + W2 + W3" formA.C0 <- "A.C0 ~ W1 + W3" formA.C0.mis <- "A.C0 ~ W1 + W2 + W3" formA.C1 <- "A.C1 ~ W2 + L1.2 + L1.3 " formA.C1.mis <- "A.C1 ~ W1 + W2 + W3 + L1.2 + L1.3" formA.C2 <- "A.C2 ~ L1.2 + L2.2 + L2.3" formA.C2.mis <- "A.C2 ~ W1 + W2 + W3 + L1.2 + L1.3 + L2.2 + L2.3" gform.cor <- c(formA0, formA.C0, formA.C1, formA.C2) gform.mis <- c(formA0, formA.C0.mis, formA.C1.mis, formA.C2.mis) gformList <- list(gform.cor=gform.cor, gform.mis=gform.mis) formY <- "Q.kplus1 ~ log(W1) + L1.3 + L2.2 + L2.3" formL2.3 <- "Q.kplus1 ~ log(W1) + L1.3" formL2.2 <- "Q.kplus1 ~ log(W1) + I(L1.2*L1.3)" formL2.1 <- "Q.kplus1 ~ W1 + L1.2 + L1.3" formL1.3 <- "Q.kplus1 ~ W1*W2" formL1.2 <- "Q.kplus1 ~ log(W1) + I(W2*W3)" formL1.1 <- "Q.kplus1 ~ W1 + W3" formY.base <- "Q.kplus1 ~ W1 + W2 + W3" form.mis <- "Q.kplus1 ~ 1" Qform.cor <- c(formL1.1, formL1.2, formL1.3, formL2.1, formL2.2, formL2.3, formY) Qform.mis1 <- c(rep(formY.base,7)) Qform.mis2 <- c(rep(form.mis, 7)) for(n in size){ psi <- matrix(NA, nrow=niter, ncol=16) colnames(psi) <- paste(c("iptw","iptw.n","Gcomp.Qc", "tmle.Qc", "Gcomp.Qm1", "tmle.Qm1", "Gcomp.Qm2", "tmle.Qm2"), rep(c("gc", "gm"), each=8), sep=".") var.psi <- matrix(NA, nrow=niter, ncol=6) for (i in 1:niter) { d <- generate_sim2(n) for(j in 1:length(gformList)){ est.cor <- getEstimates(d\$d, Inodes=d\$Inodes, Lnodes=d\$Lnodes, Ynodes=d\$Ynodes, Qform.cor, gformList[[j]], gbd) est.mis1 <- getEstimates(d\$d, Inodes=d\$Inodes, Lnodes=d\$Lnodes, Ynodes=d\$Ynodes, Qform.mis1, gformList[[j]], gbd) est.mis2 <- getEstimates(d\$d, Inodes=d\$Inodes, Lnodes=d\$Lnodes, Ynodes=d\$Ynodes, Qform.mis2, gformList[[j]], gbd) psi[i,((j-1)*8+1):(j*8)] <- c(est.cor[1:4], est.mis1[3:4], est.mis2[3:4]) var.psi[i,((j-1)*3+1):(j*3)] <- c(est.cor[5], est.mis1[5], est.mis2[5]) }} psi0 <- d\$psi0 results <- cbind(est=colMeans(psi), bias=colMeans(psi-psi0), var = apply(psi, 2, var), MSE = colMeans((psi-psi0)^2)) print(results,4) save(results, psi, psi0, var.psi, file = paste("tmle_ISM_sim2.n",n,".Rdata", sep="")) }