# NAME: bias.pboot # AUTHORS: Kristin Porter and Susan Gruber # (a modified version of check.ETA by Yue Wang in cvDSA package) # DATE: September 25, 2010 # SUMMARY: This function carries out a parametric bootstrap to diagnose bias in the following # estimators of the marginal additive treatment effect of a binary point treatment: # G-Computation Estimator (G-COMP), Inverse Probability of Treatment Weighted (IPTW # estimator), Augmented IPTW (A-IPTW) estimator and Targeted Maximum Likelihood Estimator (TMLE). # The diagnostic estimates bias due to (1) positivity violations and/or (2) bounding of predicted # treatment assignment probabilities. # # REFERENCE: Petersen, M.L., Porter, K.E., Gruber S., Wang, Y., van der Laan, M.J.,2010. # Diagnosing and Responding to Violations of the Positivity Assumption, Statistical Methods # in Medical Research, (forthcoming, 2010). # The file "positivity_simulations.R," available at http://www.stat.berkeley.edu/users/laan/Software/ # contains sample code to generate data and call the bias.pboot function. #---------- function bound ------------ # truncate values at (min, max) bounds. #--------------------------------------- bound <- function(x, bounds){ x[xmax(bounds)] <- max(bounds) return(x) } #----------- function tmle----------------------------------------------- # targeted maximum likelihood estimation of the additive treatment effect # based on external estimates of Q and g1W. Targeting through a logistic # fluctuation, with predicted probabilities bounded away from (0,1) at # (alpha, 1-alpha) # returns: psi - the estimate #------------------------------------------------------------------------ tmle <- function(Y,A,Q,g1W, alpha){ ab <- range(Y) Q <- bound(Q, ab) Q <- qlogis(bound((Q-ab[1])/diff(ab), c(alpha, 1-alpha))) Ystar <- (Y-ab[1])/diff(ab) h <- cbind((A/g1W - (1-A)/(1-g1W)), 1/g1W, -1/(1-g1W)) suppressWarnings(eps <- coef(glm(Ystar~-1+offset(Q[,1]) + h[,1], family="binomial"))) Qstar <- plogis(Q + eps*h) * diff(ab) + ab[1] psi <- mean(Qstar[,2]) - mean(Qstar[,3]) return(psi) } #----------- function iptw--------------------------------------------------------- # inverse probability of treatment weighting estimation of the additive treatment # effect using wts based on inverse of g1W. # returns: psi - the estimate #---------------------------------------------------------------------------------- iptw <- function(Y,A, Qfamily, wts){ if (identical(Qfamily,"binomial") | identical(Qfamily,binomial)){ EY1 <- A*wts*Y EY0 <- (1-A)*wts*Y QAW <- EY1*A + EY0*(1-A) psi <- mean(EY1) - mean(EY0) } else { m <- glm(Y~A, weights=wts, family=Qfamily) QAW <- predict(m, type="response") psi <- predict(m, newdata=data.frame(A=1), type="response") - predict(m, newdata=data.frame(A=0), type="response") } return(psi) } #----------function bias.pboot ------------------------------------------------------- # arguments # d - a dataframe with columns (in order) # Y - continuous or binary outcome # A - binary treatment indicator # W - matrix of baseline covariates # B - number of bootstrap samples; default = 1000 # gbounds - vector of length two indicating lower and upper bounds for g_n = P(A=1|W) when # fitting IPTW (not for generating bootstrap samples); default = c(0,1), no bounding # Qmethod - method for estimating Q_0 for generating bootstrap samples: # "glm" - generalized linear regression with user-supplied regression formula # "step1"= step algorithm, forcing in A only; # "step2"= step algorithm, forcing in A and propensity score # gmethod - method for estimating g_0 for generating bootstrap samples: "glm" or "step" # Qform - regression formula for glm estimation of Q_n for generating bs samples if Qmethod="glm" # gform - regression formula for glm estimation of g_n for generating bs samples if gmethod="glm" # gfamily - family for estimating g_0; default = "binomial"(link="probit") # Qfamily - family for estimating Q_0; default = "gaussian" # pboot - logical flag, default=TRUE to obtain estimates on original sample and run parametric # bootstraps. Set to FALSE to obtain estimates from each estimator in original sample only # returns # est - estimate obtained by applying each estimator to original sample: Gcomp IPTW A-IPTW TMLE # est.bs - mean estimate from applying each estimator to B bootstrap samples # bias - non-p bootrap bias of each estimator # relbias - non-p bootstrap bias relative to Gcomp # bs.results - 4xB matrix of estimates on all bootstrap samples #-------------------------------------------------------------------------------- bias.pboot <- function(d, B=1000, gbounds=c(0,1), Qmethod, gmethod, Qform=NULL, gform=NULL, gfamily="binomial"(link="probit"), Qfamily="gaussian", pboot=TRUE) { #----------function p.boot ------------------------------------------------------- # parametric bootstrap, called by bias.pboot function # - generate data based on given models for Q and g # - obtain parameter estimates of the generated data using the procedures # specified by "Qmethod" and "gmethod" arguments # arguments: # Q.model - parametric model to use to generate values for Y given A,W # g.model - parametric model to use to generate g1W # gbounds - bounds on predicted probabilities P(A=1|W) # W - baseline covariates # returns: # estimates of the additive treatment effect for Gcomp, IPTW, A-IPTW, and TMLE. #-------------------------------------------------------------------------------- p.boot <- function(Q.model, g.model, W){ generate_data <- function(){ g1W <- predict(g.model, newdata = data.frame(W[bs,]),type="response") A <- rbinom(n,1,g1W) Y <- predict(Q.model,newdata=data.frame(A, W[bs,], g1W),type="response")+ rnorm(n) return(data.frame(Y,A,W[bs,])) } n <- nrow(W) bs <- sample(1:n, n, replace=TRUE) df <- generate_data() if(Qmethod=="glm"){ m <- glm(Q.model$formula, data=df, family=Qfamily) } else { upper.Q <- paste("~A+", paste(colnames(df)[-(1:2)],collapse="+")) m <- glm(Y~A, data=df, family=Qfamily) m <- step(m,scope=list(upper=upper.Q, lower=~A), trace=0, direction="forward", data=df) } QAW <- predict(m) Q1W <- predict(m, newdata=data.frame(df[,-2], A=1)) Q0W <- predict(m, newdata=data.frame(df[,-2], A=0)) if(gmethod=="glm"){ g <- glm(g.model$formula, data=df, family=gfamily) } else { g <- glm(A~1, data=df, family=gfamily) upper.g <- paste("~",paste(colnames(df)[-(1:2)], collapse="+")) g <- step(g, scope=list(upper=upper.g, lower=~1), trace=0, direction="forward", data=df[,-1]) } g1W <- bound(predict(g, type="response"), gbounds) wts <- 1/g1W wts[df$A==0] <- 1/(1-g1W[df$A==0]) psi.IPTW <- iptw(df$Y,df$A, Qfamily, wts)$psi psi.tmle <- tmle(df$Y,df$A, Q=cbind(QAW, Q1W, Q0W), g1W=g1W,alpha=0.999999)$psi psi.Gcomp <- mean(Q1W)-mean(Q0W) psi.A_IPTW <- mean((df$A - (1-df$A))* wts * (df$Y - QAW) + Q1W - Q0W) return(c(Gcomp=psi.Gcomp, IPTW = psi.IPTW, A_IPTW=psi.A_IPTW, tmle=psi.tmle)) } # checks and initializations if (!(Qmethod %in% c("glm", "step1", "step2"))){ stop("\nWARNING: Estimation method for Q must be 'glm', 'step1', or 'step2'\n") } if (!(gmethod %in% c("glm", "step"))){ stop("\nWARNING: Estimation method for g must be 'glm', 'step'\n") } if (length(gbounds)==1){gbounds <- c(gbounds, 1-gbounds)} if(min(gbounds) < 0 | max(gbounds > 1)){ stop("\nWARNING: Bounds on g must be between 0 and 1\n") } # Estimate g(1|W) if(gmethod=="glm"){ g <- glm(gform, data=d, family=gfamily) } else { g <- glm(A~1, , data=d, family=gfamily) upper.g <- paste("~",paste(colnames(d)[-(1:2)], collapse="+")) g <- step(g, scope=list(upper=upper.g, lower=~1), trace=0, direction="forward") } g1W <- bound(predict(g, type="response"), gbounds) wts <- 1/g1W wts[d$A==0] <- 1/(1-g1W[d$A==0]) # Estimate Q on original sample if(Qmethod=="glm"){ m <- glm(Qform, data=d, family=Qfamily) } else { Qform <- "Y~A" lower.Q <- "~A" if (Qmethod=="step2" & length(unique(g1W))>1){ Qform <- "Y~A+g1W" lower.Q <- "~A+g1W" } upper.Q <- paste(lower.Q, "+", paste(colnames(d)[-(1:2)],collapse="+"), collapse="") m <- glm(as.formula(Qform), data=data.frame(d, g1W), family=Qfamily) m <- step(m,scope=list(upper=upper.Q, lower=lower.Q), trace=0, direction="forward") } QAW <- predict(m, type="response") Q1W <- predict(m, newdata=data.frame(d[,-2], A=1), type="response") Q0W <- predict(m, newdata=data.frame(d[,-2], A=0), type="response") # Get estimates on original sample psi.Gcomp <- mean(Q1W-Q0W) psi.IPTW <- iptw(d$Y,d$A,Qfamily, wts) psi.tmle <- tmle(d$Y,d$A, Q=cbind(QAW, Q1W, Q0W), g1W=g1W, alpha=0.999999) psi.A_IPTW <- mean((d$A - (1-d$A))* wts * (d$Y - QAW) + Q1W - Q0W) # call pboot function to get estimates on B bootstrap samples if(pboot){ bs.results <- replicate(B, p.boot(Q.model=m, g.model=g, W=d[,-(1:2), drop=FALSE])) est.bs <- rowMeans(bs.results) ETA.bias <- rowMeans(bs.results-psi.Gcomp) } else { bs.results <- est.bs <- ETA.bias <- NULL } return(list(est=c(Gcomp=psi.Gcomp, IPTW=psi.IPTW, A_IPTW=psi.A_IPTW,TMLE=psi.tmle), est.bs = est.bs, ETA.bias=ETA.bias, bs.results=bs.results)) }