# Collaborative Targeted Maximum Likelihood Estimation (ctmle) # Susan Gruber (sgruber@cal.berkeley.edu) # Mark van der Laan (laan@berkeley.edu) # September 15, 2010 # Estimates EY1 parameter (missingness only, no treatment) # Dec 7, 2011 # removed projection onto A in stage 1 # also changed cv pen from var(IC) to var(Dstar) # updated Feb 15, 2011 # ICg calculation fixed # The DSA and SuperLearner packages are strongly recommended in order to # obtain reliable results. # # SuperLearner is available at http://www.stat.berkeley.edu/users/ecpolley/SL/ # DSA: http://www.stat.berkeley.edu/users/laan/Software/ # These packages depend on # modelUtils: http://www.stat.berkeley.edu/users/laan/Software/ # nnls: http://cran.r-project.org/web/packages/nnls/index.html # quadprog: http://cran.r-project.org/web/packages/quadprog/index.html #----------- Description ----------- # This R source code implements C-TMLE, originally # described in Gruber S., van der Laan J., "An Application of Collaborative # Targeted Maximum Likelihood Estimation in Causal Inference and Genomics," # The International Journal of Biostatistics,(6)1, 2010. # # for EY1 parameter - no treatment! #------------------------------------------------------------------------------------ # Print and summary functions for ctmle objects summary.ctmle <- function(object,...){ if(identical(class(object), "ctmle")){ d <- ncand <- terms <- covar <- NULL if(!is.null(object$candidates)){ ncand <- sum(!is.na(object$candidates$candidates$terms)) terms <- c(object$candidates$candidates$terms[-1]) covar <- object$candidates$candidates$covar[1:ncand] d <- data.frame(c("(intercept)",object$candidates$candidates$terms[-1])[1:ncand], object$candidates$candidates$covar[1:ncand], object$candidates$results.all$est[1:ncand], object$cv.res$likelihood[1:ncand], # this is how the thing was selected. object$cv.res$varIC[1:ncand], object$cv.res$penlikelihood[1:ncand]) dimnames(d) <- list(paste("cand", 1:ncand), c("term added", "cleverCovar","estimate", "cv-RSS", "cv-varIC", "cv-penRSS")) } names(object$best_k) <- names(object$est) <- NULL summary.ctmle <- list(est=object$est, var=object$var.psi, CI=object$CI,pvalue=object$pvalue, d=d, selected=object$best_k, ncand=ncand, terms=terms, covar=covar) } else { summary.ctmle <- NULL } class(summary.ctmle) <- "summary.ctmle" return(summary.ctmle) } print.summary.ctmle <- function(x,...){ if(identical(class(x), "summary.ctmle")){ if(!is.null(x$d)){ npercovar <- table(x$covar[1:x$ncand]) suffix <- paste(x$covar[1:x$ncand], letters[unlist(apply(npercovar, 1,function(x){1:x}))], sep="") suffix[cumsum(npercovar)[npercovar==1]] <- names(cumsum(npercovar)[npercovar==1]) prev_covar <-suffix[cumsum(npercovar[-length(npercovar)])] prev_moves <- c(paste(" + epsilon", prev_covar, " * h", prev_covar, sep=""),"") current_move <- paste(" + epsilon", suffix, " * h", suffix,sep="") TMLEcand <- paste("\tcand ", 1:x$ncand,": Q",1:x$ncand,"(W) =", sep="") fluctuations <- paste("Q0(W)", c(rep("", npercovar[1]), mapply(function(x){paste(prev_moves[1:(x-1)],collapse="")},x$covar[-(1:npercovar[1])])), current_move, sep="") final_update <- c(rep("\n", npercovar[1]), paste("\t = Q", rep(cumsum(npercovar[-length(npercovar)]), times=npercovar[-1]),"(W)", current_move[(npercovar[1]+1) : x$ncand],",\n", sep="")) tx <- c("\t\t\th1a is based on an intercept-only model for missingness mechanism g(W)\n\n", paste("\t\t\th", suffix[-1], " is based on a missingness mechanism model containing covariates ", mapply(function(x1){paste(x$terms[1:x1], collapse=", ")}, 1:(x$ncand-1)), "\n\n", sep="")) cat("\nNumber of candidate TMLE estimators created: ", x$ncand, "\n") cat("A candidate TMLE estimator was created at each move, as each new term\nwas incorporated into the model for g.\n") cat(paste(rep("-",70), collapse=""), "\n") print(x$d, digits=3) cat(paste(rep("-",70), collapse=""), "\n") cat("Selected TMLE estimator is candidate", x$selected,"\n\n") cat("Each TMLE candidate was created by fluctuating the initial fit, Q0(W)=E[Y|Delta=1,W], obtained in stage 1.\n\n") cat(paste(TMLEcand, fluctuations, final_update, tx)) } cat(paste(rep("-",10), collapse=""), "\n") cat("C-TMLE result:\n") cat("\tparameter estimate: ", round(x$est,5), "\n") cat("\testimated variance: ", round(x$var,5), "\n") cat("\t p-value: ", ifelse(x$pvalue <= 2*10^-16, "<2e-16",signif(x$pvalue,5)), "\n") cat("\t 95% conf interval:", paste("(", round(x$CI[1],5), ", ", round(x$CI[2],5), ")", sep=""),"\n") } } #-------------print.ctmle------------------ # print object returned by ctmle function #----------------------------------------- print.ctmle <- function(x,...){ if(identical(class(x), "ctmle")){ cat("C-TMLE result:\n") cat("\tparameter estimate: ", round(x$est,5), "\n") cat("\testimated variance: ", round(x$var.psi,5), "\n") cat("\t p-value: ", ifelse(x$pvalue <= 2*10^-16, "<2e-16",signif(x$pvalue,5)), "\n") cat("\t 95% conf interval:", paste("(", round(x$CI[1],5), ", ", round(x$CI[2],5), ")", sep=""),"\n") } else { stop("Error calling print.ctmle. 'x' needs to have class 'ctmle'") } } #---------- function bound --------------- # set outliers to min/max allowable values # assumes x contains only numerical data #----------------------------------------- bound <- function(x, bounds){ x[xmax(bounds)] <- max(bounds) return(x) } #---------- function Tx_mech_EY1 ------- # purpose: calculate h and g1W given a model for g and # treatment indicator, A. # arguments: # g - glm (or other object) with a prediction method # for P(A=1|W) # X - design matrix, A in column 1, plus covariates W # gbound - bound for g # returns: # h - a vector = 1/pDelta=1|W # g1W - P(A=1, W) #---------------------------------------- Tx_mech_EY1 <- function (m.Delta,X, gbound) { pDelta1<- bound(predict(m.Delta, newdata=X, type="response"), c(gbound, 1)) h <- 1/pDelta1 return(list(h=h, g1W=pDelta1)) } #----------------function calc_varIC_EY1-------------- # purpose: calculate the variance of the IC, # arguments: # Y - outcome # Q - response, not linear predictors # h - clever covariate # A - target var # W - only the terms in g # g1W - p(A=1|W) # (Inference should take ICg term into account) # returns: variance of the influence curve #------------------------------------------------- calc_varIC_EY1 <- function(Y, Q, h, Delta, W=NULL, g1W=NULL, ICg=FALSE) { Y[Delta==0] <- 0 Dstar <- Delta*(Y-Q) * h + Q - mean(Q) varDstar <- var(Dstar) varIC <- varDstar if(ICg & !is.null(W) & !is.null(Delta)){ if(NCOL(W) > 0){ W <- cbind(rep(1,nrow(as.matrix(W))), W) term1 <-colMeans( -(Y-Q) * W * Delta * (1-g1W)/g1W ) term2 <- try(solve(matrix(rowMeans(apply(cbind(g1W, W), 1, function(v){v[-1] %o% v[-1] * v[1] * (1-v[1])})),byrow=TRUE, nrow=ncol(W))), silent=TRUE) if(is.matrix(term2)){ IC <- as.vector(Dstar + term1 %*% term2 %*% t((Delta-g1W) * W)) varIC <- var(IC) } } } return(c(varDstar, varIC)) } #--------------------function rankCovars---------------- # univariate regression of outcome (or optionally, residuals) on Ws # Y - outcome # W - covariate matrix # QAW.lp - linear predictors to use for offset # family # type - "raw", don't adjust, otherwise adjust using Benjamini-Hochberg procedure # returns a list, each element ordered by significance, most sig first. # cnames is the column names # index is the ordered column numbers # p is a vector of the (optionally FDR-adjusted) p-values #-------------------------------------------------- .rankCovars <- function (Y,W, QAW.lp=rep(0, length(Y)), family="gaussian",type="raw"){ W <- as.matrix(W) n <- NCOL(W) cnames <- colnames(W) o <- 1:n p <- rep(NA, n) for (i in 1:n){ suppressWarnings(m <- glm(Y~W[,i]+offset(QAW.lp), family=family)) if(nrow(summary(m)$coef) > 2) { p[i] <- min(summary(m)$coef[-1,4]) #factor } else if(nrow(summary(m)$coef) ==2 & ncol(summary(m)$coef)==4) { p[i] <- summary(m)$coef[2,4] } else if(nrow(summary(m)$coef) ==1 & ncol(summary(m)$coef)==4) { p[i] <- summary(m)$coef[1,4] } else { p[i] <- Inf } } if (type !="raw") { adjp <- .BH(p) } else { adjp <- p } o <- order(adjp) return(list(cnames=cnames[o], index=o, p=adjp[o])) } #----- function enrich_W ----- # purpose: create a dataset of potential confounders from W # by carrying out dimension reduction and creation of second order terms # select covariates that are predictive of residuals (Y-initialQ) # check for significant 2nd order terms. # arguments: # Y,A,W - the data # QAW - fitted values from initial (stage 1) estimate of Q # train - logical indicator for obs in training set # Qfamly - regression family # returns: # enriched covariate set, retaining at least 2 covariates # ------------------------------------ enrich_W <- function (Y,Delta,W,QAW, family="gaussian",enrich, reduce, verbose) { p_enriched <- .1 newnames <- cnames <- colnames(W) W_enriched <- as.matrix(W) # create additional second order terms if(enrich){ if(ncol(W_enriched)>1){ for (i in 1:(ncol(W)-1)){ W_enriched <- cbind(W_enriched, W[,i] * W[,-(1:i)]) newnames <- c(newnames, paste(cnames[i],cnames[-(1:i)], sep="")) }} W_enriched <- cbind(W_enriched, W^2) colnames(W_enriched) <- c(newnames, paste(colnames(W),colnames(W),sep="")) W_enriched <- t(unique(t(W_enriched))) } # dimension reduction if(reduce){ if(NCOL(W_enriched) > 2) { covars_enriched <- .rankCovars(Y,W_enriched,QAW, family=family,type="raw") keep <- covars_enriched$index[covars_enriched$p <= max(p_enriched, covars_enriched$p[2])] W_enriched <- as.matrix(W_enriched[,keep]) } } if(verbose){cat("\tSelected set of covariates:", colnames(W_enriched), "\n")} return(W_enriched) } #--------------function select_terms---------- # purpose: add terms to the current model for Delta # as long as they continue to increase the # (penalized) likelihood # arguments: # Y - outcome # X - design matrix, Delta in column 1 # g.dataset - dataset containing A and W for all observations # Q: Q1W # family - binomial for binary outcomes, gaussian for continuous outcomes # Qbounds - respect constraints on Y # g.Deltaform - starting formula for regression of Delta on W # (e.g. terms forced in from previous iterations) # terms_remaining - terms to consider adding to the model for g # nterms - bound on number of terms to add # training_set - identifies which values of Tx$h correspond to # the observations in Y and X (since g is fit on all obs). # gbound - bounds for predicted values of g #-------------------------------------------------- select_terms <- function(Y,X,g.dataset=X, Q, family, g.Deltaform, terms_remaining, nterms, training_set, covar, like_type, gbound, verbose=TRUE) { Y[X[,"Delta"] == 0] <- 0 # overwrite NAs bestScore <- Inf improving <- TRUE DONE <- length(terms_remaining) == 0 allInf <- FALSE counter <- 0 epsilon <- NULL # first covar- special check for intercept-only model if(covar==1){ g <- glm(g.Deltaform,data=g.dataset, family=binomial) Tx <- Tx_mech_EY1(g, g.dataset[training_set,], gbound) epsilon <- suppressWarnings(coef(glm(Y~-1+offset(Q)+Tx$h, family=family, subset=X$Delta==1))) Qstar <- Q + epsilon * Tx$h if(family=="binomial"){ Qstar <- plogis(Qstar) } varIC <- calc_varIC_EY1(Y, Qstar, Tx$h, X[,"Delta"])[1] * sum(training_set)/length(training_set) if(like_type == "RSS"){ bestScore <- sum(X[,"Delta"]*(Y - Qstar)^2) + varIC[1] } else { bestScore <- -sum(X[,"Delta"]*(Y*log(Qstar) + (1-Y)*log(1-Qstar))) + varIC[1] } if(verbose){cat("\n\t\t",paste("penalized", like_type, "(intercept only):"), round(bestScore,5), "\n")} } while (!DONE) { counter <- counter +1 score <- rep(NA, length(terms_remaining)) eps_try <- rep(NA, length(terms_remaining)) if(verbose){ cat("\n\t Selecting best term to add to model...\n") } for (i in 1:length(terms_remaining)){ g.Deltaform_try <- paste(g.Deltaform, "+",terms_remaining[i]) if(verbose) { cat("\t\tTrying", g.Deltaform_try,"\n") } g <- glm(g.Deltaform_try,data=g.dataset, family=binomial) Tx <- Tx_mech_EY1(g, g.dataset[training_set,], gbound) eps_try[i] <- suppressWarnings(coef(glm(Y~-1+offset(Q)+Tx$h, family=family, subset=X$Delta==1))) Qstar <- Q + eps_try[i] * Tx$h if(family=="binomial"){ Qstar <- plogis(Qstar) } varIC <- calc_varIC_EY1(Y, Qstar, Tx$h, Delta=X[,"Delta"])[1] * sum(training_set)/length(training_set) if(like_type == "RSS"){ score[i] <- sum(X[,"Delta"]*(Y - Qstar)^2) + varIC[1] } else { score[i] <- -sum(X[,"Delta"]*(Y*log(Qstar) + (1-Y)*log(1-Qstar))) + varIC[1] } } if(verbose) {cat("\t\t",paste("penalized", like_type,":"), round(score,5), "\n")} score[is.nan(score)] <- Inf best <- which.min(abs(score)) if(verbose) {cat("\t\tbest choice:", terms_remaining[best], "\n")} if (length(best)==0) { allInf <- TRUE improving <- FALSE # next lines assure an assignment later doesn't crash best <- 1 score[best] <- Inf eps_try[best] <- 0 } else if (is.infinite(score[best])) { allInf <- TRUE improving <- FALSE } else { improving <- (bestScore[length(bestScore)]>score[best] ) } prevg.Deltaform <- g.Deltaform g.Deltaform <- paste(g.Deltaform,"+", terms_remaining[best]) bestScore <- c(bestScore, score[best]) epsilon <- c(epsilon, eps_try[best]) terms_remaining <- terms_remaining[-best] DONE <- !improving |length(terms_remaining) == 0 | nterms == counter } if(counter==1 & allInf) { if(length(terms_remaining)>1){ g.Deltaform <- paste(g.Deltaform," + ", paste(terms_remaining[1:(nterms-counter)], collapse=" + "), sep="") } epsilon <- c(epsilon, rep(0,(nterms-counter))) bestScore <- Inf } else if (!improving & (counter > 1 | allInf) ){ if(verbose){cat("\t\t(However, no additional term improves the penalized", like_type, ", so none of these\n\t\twill be included in the model for the current clever covariate.)\n\n")} g.Deltaform <- prevg.Deltaform epsilon <- epsilon[-length(epsilon)] bestScore <- bestScore[length(bestScore)-1] } else { bestScore <- bestScore[length(bestScore)] } return(list(g.Deltaform=g.Deltaform, epsilon=epsilon, score=bestScore)) } #-------------function evaluate_candidates------- # purpose: calculate RSS, var, estimate for each candidate estimator # adds Gcomp on initial as 1st estimator, so k=2 corresponds # to tmle estimator using first g-hat to calculcate h. # arguments: # Y - outcome var # X - design matrix (when called from cv, this should be test set observations) # Q - nx2 matrix of values corresponding to initial Q[QAW, Q1W] # family - binomial for binary outcomes, gaussian for continuous outcomes # Qbounds - respect constraints on Y # g.dataset - typically all observations, used to fit g # candidates: a list with one entry for each candidate estimator # term - term name, in order of incorporation into clever covariates # covar - a number indicating which clever covariate the term is a member of # epsilon - epsilon corresponding to each clever covar # ncandidates - number of candidates to evaluate # like_type "RSS" or "loglike" # gbound - bound on predicted value for g # training_set - obs for model fits # returns: likelihood, var(Dstar+IC_g), estimate for each candidate # note: each term "within" the same clever covar has the same offset, # but its epsilon will be different #------------------------------------------------- evaluate_candidates <- function(Y,X,Q, family, g.dataset, ab, candidates, ncandidates, like_type, gbound,training_set=1:length(Y)) { Y[X[,"Delta"]==0] <- 0 #---------------function which.cols---------------- # given a vector of column names and a formula # return indices of the names of terms on rhs of formula #-------------------------------------------------- which.cols <- function(cnames, form){ fterms <- attr(terms(as.formula(form)),"term.labels") which(cnames %in% fterms) } Qinit <- Q # don't need this? test_set <- !training_set | all(training_set) g.Deltaform = "Delta~1" nterms <- sum(!is.na(candidates$terms)) cum_nterms <- cumsum(table(candidates$covar)) ncovar <- length(candidates$epsilon) est <- likelihood <- varIC <- varDstar <- rep(Inf, ncandidates+1) covar <- 1 for (i in 1:nterms){ g.Deltaform <- paste(g.Deltaform, "+",candidates$terms[i], sep="") g <- glm(g.Deltaform, data=g.dataset, family=binomial) #fit g on all obs Tx <- Tx_mech_EY1(g, g.dataset, gbound) Qstar <- Q + candidates$epsilon[i] * Tx$h if(family=="binomial"){ Qstar <- plogis(Qstar) } # track est on training_set to use for bias calculation in cv function est[i] <- (mean(Qstar[training_set]))*diff(ab) + ab[1] if(like_type=="RSS") { likelihood[i] <- sum(X[test_set,"Delta"]*(Y[test_set]-Qstar[test_set])^2) } else { likelihood[i] <- -sum(X[test_set,"Delta"]*(Y[test_set]*log(Qstar[test_set]) + (1-Y[test_set])*log(1-Qstar[test_set]))) } temp <- calc_varIC_EY1(Y[test_set], Q=Qstar[test_set], h=Tx$h[test_set], Delta=X[test_set,1], W=X[test_set, which.cols(colnames(X), g.Deltaform)], g1W=Tx$g1W[test_set], ICg=TRUE) varDstar[i] <- temp[1] varIC[i] <- temp[2] if(is.nan(est[i])|is.infinite(est[i])){est[i] <- NA} if(is.nan(likelihood[i])|is.infinite(likelihood[i])){likelihood[i] <- NA} if(is.nan(varIC[i])|is.infinite(varIC[i])){varIC[i] <- NA} if(is.nan(varDstar[i])|is.infinite(varDstar[i])){varDstar[i] <- NA} #jump to next clever covariate at pre-determined model sizes if (i == cum_nterms[covar]) { covar <- covar+1 Q <- Q + candidates$epsilon[i] * Tx$h } } # this estimate is on the original scale, but varIC and varDstar are not return(list(est=est, likelihood=likelihood, varDstar=varDstar, varIC=varIC)) } #--------------function construct_candidates------ # purpose - build increasingly non-p candidate g-hats ## construct clever covariates using forward selection # for each clever covar add 1 or more variables to terms in the previous clever covar, # until no improvement in RSS. Iterate until all terms have been added # For stability, calculate g based on data from full sample. # arguments: # Y - outcome vector # X - design matrix (A in first column) # Q - stage 1 estimate of Q # family - binomial for binary outcomes, gaussian for continuous outcomes # ncandidates - max # candidates to construct - i think ignored for now. # terms - names of covariates to consider adding to model for g # training_set - indicates which obs in Tx$h correspond to obs in Y, X # gbound - bounds for predicted values for g1W # like_type - RSS or loglike # return: # terms - a vector containing the covariates in the order that they were added # to the model for g, # covar - which clever covariate each term is in, # epsilon - corresponding epsilon values. #-------------------------------------------------- construct_candidates <- function(Y, X, Q, family, ncandidates=ncol(X)-1,terms,training_set, gbound, like_type, verbose=FALSE){ g.dataset <- X terms_remaining <- terms nterms <- length(terms_remaining) g.Deltaform <- "Delta~1" epsilon <- NULL covar <- NULL i <- 0 DONE <- nterms <= 0 while(!DONE) { i <- i+1 if(verbose) {cat("\tBeginning construction of clever covariate", i,"\n")} nextCandidates <- select_terms(Y[training_set],X[training_set,], g.dataset, Q[training_set], family, g.Deltaform, terms_remaining, nterms=length(terms_remaining), training_set, covar=i, like_type=like_type, gbound=gbound, verbose=verbose) newg.Deltaform <- nextCandidates$g.Deltaform epsilon <- c(epsilon, nextCandidates$epsilon) newterms <- setdiff(attr(terms(as.formula(newg.Deltaform)),"term.labels"), attr(terms(as.formula(g.Deltaform)),"term.labels")) if(i==1) {covar <- 1} terms_remaining <- terms_remaining[-which(terms_remaining %in% newterms)] covar <- c(covar, rep(i, length(newterms))) g <- glm(newg.Deltaform, data=g.dataset, family="binomial") Tx <- Tx_mech_EY1(g, g.dataset[training_set,], gbound) Q[training_set] <- Q[training_set] + epsilon[length(epsilon)] * Tx$h g.Deltaform <- newg.Deltaform DONE <- length(terms_remaining)==0 | ncandidates <= length(covar)-1 if(verbose){ cat("\tThe model for clever covariate", i, "is complete.\n") cat("\tConstructed regression equation for estimating g(W)=p(Delta=1|W):\n\t\t\t", g.Deltaform, "\n\n") cat("\t\t...Calculating h(A,W) based on candidate g-estimator", i,"\n") cat("\t\t...Running a logistic regression to fit epsilon\n") cat("\t\t...Updating estimate of Q(W) = Q(W) + epsilon * h(W)\n") if(DONE){ cat("\tAll candidate TMLE estimators have been constructed\n\n") } else { cat("\n\tReady to use the updated estimate of Q(W) to construct the next clever covariate.\n") cat("\t(The base model for clever covariate", i+1, "contains all terms included in clever covariate", i,")\n\n") } } } terms=c(attr(terms(as.formula(g.Deltaform)), "term.labels")) end <- min(length(terms), ncandidates)+1 return(list(terms=c('1',terms[1:(end-1)]), covar=covar[1:end], epsilon=epsilon[1:end])) } #----- function estQ_cvSL ---- # purpose: Obtain cross-validated estimates for initial Q using discrete SL. # This function will return cross-validated predicted initial values # corresponding to the best algorithm in the library, or the convex combination # calculated by SL itself, as chosen by cvRSS. # The fitted value for each observation i, is predicted from a fit based on a training set # excluding the fold containing observation i. # arguments: # Y - outcome # X - design matrix with A in first column # SL.library - prediction algorithms # V - number of outer cv-folds for disscrete SL # V_SL - number of folds for internal SL cross-validation # family - binomial or gaussian # returns: # Q - nx3 matrix of predicted values on linear scale (e.g. logit if Qfamily=binomial) #----------------------------------------------- estQ_cvSL.nomissing <- function(Y,Delta, X,newX=X,SL.library, V=5, V_SL=5, family="gaussian", Qbounds, verbose){ Q <- cvRSS <- NULL n <- length(Y) fold <- by(sample(1:n),rep(1:V, length.out=n),function(x){x}) n_predictors <-length(SL.library) if(verbose){ cat("\tStage 1: Estimating cross-validated initial Q with Super Learner\n") } # We'll create a matrix of predictions - one column for each predictor in the library # plus one more for SL itself, with 3*n predicted values per column, corresponding to newX. predictions <- matrix(nrow=3*n, ncol=length(SL.library)+1) m_SL <- NULL for (v in 1:V) { fold_rows <- fold[[v]] if(class(m_SL) != "try-error"){ m_SL <- try(SuperLearner(Y=Y[-fold[[v]]][Delta[-fold[[v]]]==1], X=X[-fold[[v]],][Delta[-fold[[v]]]==1,], newX=newX[fold_rows,], V=V_SL, save.fit.library=FALSE, family=family,SL.library=SL.library)) } if(class(m_SL) != "try-error"){ predictions[fold_rows,1] <- m_SL$SL.predict for (s in 1:n_predictors){ predictions[fold_rows,s+1] <- m_SL$library.predict[,s] } predictions <- bound(predictions, Qbounds) } } cvRSS <- colSums(Delta*(Y-predictions[1:n])^2, na.rm=TRUE) best <- which.min(cvRSS) best_alg <- c("SL", SL.library)[best] Q <- predictions[,best] if(verbose){cat("\tDiscrete SL: best algorithm = ", best_alg,"\n")} if (is.null(Q) | class(m_SL) == "try-error"){ Q <- 0 class(Q) <- "try-error" } return(Q) } #--------------function stage1---------------- # purpose: Use Superlearner to get the stage 1 estimate of the density # Note: SL library should include many more prediction algorithms in practice # arguments: # Y - outcome var, only include obs where Y and A both observe # X - design matrix used to fit the models, A in first column # Qform - optional regression formula to predict Q # family # Qbounds - bound on initial Y and predicted values for Q. # Qinit - optional user-supplied values for $Q$ # alpha: if we're doing the Y* transformation, keep Ystar values away from (0,1) # SL.library - only used if estimating Q with SL # maptoYstar - boolean - whether or not to map to Ystar and fluctuate on logistic scale # returns: # Q - predicted values # Note: values in Q are on the logit scale for binary outcomes, Y #-------------------------------------------------- stage1_EY1 <- function(Y,X, Qform=NULL, family, Qbounds,Qinit, alpha=.995, SL.library, cvQinit, maptoYstar=FALSE,verbose=FALSE){ ab <- c(0,1) if(family == "binomial"){ Qbounds <- sort(c(1-alpha, alpha)) } else { if (is.null(Qbounds)){ Qbounds <- range(Y) } if(verbose){ cat("\tTruncating Y at (", paste(round(Qbounds,3), collapse=", "), ")\n\n") } Y <- bound(Y, Qbounds) if(!is.null(Qinit)){ Qinit <- bound(Qinit, Qbounds) } } if(maptoYstar) { ab <- range(Y[X[,"Delta"]==1]) Y <- (Y-ab[1])/diff(ab) if(!is.null(Qinit)){ Qinit <- (Qinit-ab[1])/diff(ab) } Qbounds <- sort(c(1-alpha, alpha)) if(verbose){ cat("\tMapping Y to Y* (a=", round(ab[1],3), ", b=", round(ab[2],3),") \n\n") } } if(is.null(Qinit)){ if(verbose){ cat("\t\t-----Stage 1: Estimating the conditional mean of Y given W-----\n\n") } n <- length(Y) Q_SL <- NULL X1 <- data.frame(X[,-1]) colnames(X1) <- colnames(X)[-1] if(is.null(Qform)){ if (require(SuperLearner)) { if(is.null(SL.library)){ SL.library <- c("SL.glm", "SL.step", "SL.DSA.2") } if(cvQinit){ Q_SL <- try(estQ_cvSL.nomissing(Y, Delta=X[,1], X[,-1], newX=X[,-1], SL.library, family=family, Qbounds=Qbounds, verbose=verbose)) if(class(Q_SL) != "try-error"){ Q <- Q_SL } } else { if(verbose){ cat("\tStage 1: Estimating initial Q with Super Learner\n") } Q_SL <- try(SuperLearner(Y=Y[X[,"Delta"]==1], X = X[X[,"Delta"]==1,-1], newX=X, SL.library=SL.library, save.fit.library=FALSE, family=family)) if(identical(class(Q_SL), "SuperLearner")){ Q <- Q_SL$SL.predict } } } if(identical(class(Q_SL), "try-error") | identical(class(Q_SL), "NULL")) { if(verbose){ cat("\tSuper Learner prediction not available. Using main terms linear regression to estimate initial Q\n") } Qform <- paste ("Y~1+", paste(colnames(X), collapse="+")) m <- glm(as.formula(Qform), data=data.frame(Y,X), family=family, subset=Delta==1) Q <- predict(m, newdata=data.frame(X), type="response") } } else { if(verbose){cat("\tEstimating initial Q using user-supplied regression formula\n")} m <- glm(as.formula(Qform), data=data.frame(Y,X), family=family, subset=Delta==1) Q <- predict(m, newdata=data.frame(X), type="response") } } else { Q <- bound(Qinit,Qbounds) } if (family=="binomial" |maptoYstar){ Q <- qlogis(bound(Q, Qbounds)) if(verbose){ cat("\n\tBounding predicted values for Q away from (0,1) at (", paste(round(Qbounds,3), collapse=", "),")\n\n") } } else { Q <- bound(Q, Qbounds) } family.stage2 <- c(ifelse(maptoYstar, "binomial", family), family) # project stage 1 estimate onto betaA + offset(QAW) #coefA <- suppressWarnings(coef(glm(Y~-1+X[,"A"]+offset(Q[,"QAW"]), family=family.stage2[1]))) #coefA.mat <- cbind(coefA*X[,"A"], coefA, 0) #if(identical(family.stage2[1], binomial) | identical(family.stage2[1],"binomial")){ # Q <- qlogis(bound(plogis(Q+coefA.mat), c(alpha, 1-alpha))) #} else { # Q <- bound(Q+coefA.mat, Qbounds) #} return(list(Q=Q, Y=Y, Qbounds=Qbounds, ab=ab, family=family.stage2)) } #--------------function stage2----------------- # purpose: construct candidate cTMLE estimators # based on Stage 1 fit and candidate g-hat nuisance parameter # estimators constructed within this function # arguments: # Y - outcome vector (training set passed in from cv) # X - design matrix (training set for cv) # Q - predicted values on training set # family - binomial for binary outcomes, gaussian for continuous outcomes # a vector of length 2 here, though. The first is the family to use for # fluctuating the initial estimate of Q. The second is the original family # setting. These are only different when the inputs are mapped to Ystar # In this case fluctuations are carried out on the logistic scale, but # the initial projection onto A + offset(QAW) is carried out on the gaussian. # One could argue that this projection operation properly belongs in stage1, # but then one would be unable to cross-validate the projection, so one would # be correct, but worse off. # Qbounds - respect constraints on Y after projecting on A + QAW, but do not enforce # after targeting with TMLE, as this would not then solve the eff IC equation. # ncandidates - max number of candidates to construct # training_set # like_type = RSS or loglike # enrich - boolean, whether or not to enrich covariate set # gbound - bounds on predicted values for g1W # returns: # candidates - candidates constructed by forward selection # results.all- values of RSS, varIC, estimate, for each candidate, including unadj. #---------------------------------------------- stage2 <- function(Y, X, Q, family, Qbounds, ab, ncandidates, training_set=rep(T,length(Y)), like_type, gbound,verbose=FALSE) { if(verbose) { cat ("\n\t\t-----Stage 2: Constructing candidate TMLE estimators-----\n\n") } candidates <- construct_candidates(Y, X, Q, family[1], ncandidates=ncandidates, terms=colnames(X)[-1], training_set=training_set, gbound=gbound, like_type=like_type, verbose=verbose) results.all <- evaluate_candidates(Y, X, Q, family[1], g.dataset=X, candidates, ab=ab, ncandidates=ncandidates, like_type=like_type, gbound=gbound, training_set=training_set) return(list(candidates=candidates, results.all=results.all)) } #--------------function cv----------------- # purpose: cross-validate stage 2 of the C-TMLE procedure # split data into folds and track est, RSS, varIC, bias # for each fold # arguments: # Y - outcome # X - design matrix, A in column 1 # est.all - vector of estimates corresponding to candidates fitted on full model # Q - nx2 matrix of initial fitted values, Q0[QAW, Q1W] for EY1 # family - binomial for binary outcomes, gaussian for continuous outcomes # Qbounds - respect constraints on Y # ncandidates - max # candidate nuisance parameter estimators to construct # like_type - "RSS" or "log_like", for negloglikelihood # gbound - truncation level for g # returns: est, cv-likelihood, cv-varIC, cv-bias for each estimator, and selected candidate #--------------------------------------------- # changed from varIC to varDstar, 12/7/11 cv <- function(Y,X, est.all, Q, family, Qbounds, ab, ncandidates, like_type, gbound, verbose=FALSE, PEN) { V <- 5 n <- length(Y) nconstructed <- length(est.all)-1 folds <- by(sample(1:n,n), rep(1:5, length=n), list) likelihood <- varDstar <- bias <- c(rep(0,nconstructed+1)) est <- matrix(data=NA, nrow=nconstructed+1, ncol=V) if(verbose) {cat("\tfold: ")} for (v in 1:V){ if(verbose) {cat(v," ")} test_set <- folds[[v]] training_set <- rep(TRUE, n) training_set[folds[[v]]] <- FALSE candidates <- stage2(Y,X, Q, family=family, Qbounds = Qbounds, ab=ab, ncandidates=max(nconstructed), training_set=training_set, like_type=like_type, gbound=gbound, verbose=FALSE) end <- length(candidates$results.all$est) est[1:end,v] <- candidates$results.all$est likelihood <- c(likelihood[1:end]+candidates$results.all$likelihood, rep(Inf, nconstructed[1]+1-end)) varDstar <- c(varDstar[1:end]+candidates$results.all$varDstar, rep(Inf, nconstructed[1]+1-end)) bias <- c(bias[1:end]+(candidates$results.all$est-est.all[1:end]),rep(Inf, nconstructed[1]+1-end)) } pen <- varDstar*(diff(ab))^2/V + n*(bias/V)^2 if(identical(family[1], gaussian) | identical(family[1], "gaussian")){ pen <- pen * log(n) } if(PEN){ score <- likelihood*(diff(ab)^2) + pen } else { score <- likelihood*(diff(ab)^2) } score[is.infinite(score)|is.nan(score)] <- Inf best_k <- max(which(score == min(score, na.rm=TRUE))) #best_k <- max(which(score[-1] == min(score[-1], na.rm=TRUE)))+1 # insist on at least one move if(verbose){ cat("\n\n") cat("\t terms: ", candidates$candidates$terms, "\n") cat("\t all estimates: ", est.all, "\n") cat("\t best_k: ", best_k, "\n\n") } return(list(best_k=best_k, est=est, likelihood=likelihood, like_type=like_type, penlikelihood=score, varIC=varDstar/V, bias=bias/V, pen=pen)) } #--------------function cTMLE---------------- # purpose: simplified version of cTMLE estimation procedure # stage 1 - use Superlearner (SL) to fit initial Q (glm if SL not available) # stage 2 - forward selection-based construction of candidate tmles # cross validate to select best stage 2 candidate for the given stage 1 # arguments: # Y - outcome vector # A - binary treatment indicator (1=treatment, 0=control) # W - covars to include in stage1 estimation of Q (matrix or dataframe ok) # Wg - covars to include in stage2 (defaults to W if not supplied by user) # Q - optional matrix of initial values for Q(W) (all columns besides the first are ignored) # Qform - optional regression formula for estimating initial Q # Qbounds - bound on initial Y and predicted values for Q. # cvQinit - if TRUE, cross-validate initial values for Q to avoid overfits # alpha - bound predicted probabilities away from (0,1), 0.995 (default) # (only used when family=binomial or for logistic fluctuation) # family- binomial for Y* calculation, gaussian for continuous outcomes # SL.library - optional vector of prediction algorithms for data adaptive estimation of Q # defaults to glm, DSA, step, and ridge(continuous outcomes) or knn(binary outcomes) # like_type - RSS or loglike - to use for forward selection and cross-validation # gbound - bound on P(A=1|W), defaults to 0.025 # enrich - TRUE to carry out stage 2 covar enrichment to create higher order terms, default FALSE # reduce - TRUE to screen out covariates in W not predictive of Y given initial Q ,default FALSE # fluctuation - "logistic" (default) or "linear", for targeting step. # verbose - print status messages if TRUE # detailed- return # returns: # candidates - all candidate info from stage 2 # best_k - as selected by cross-validation # est - estimate of psi_0 # CI - IC-based 95% confidence interval for parameter estimate # pvalue # likelihood - sum of squared residuals, based on selected estimator evaluated on all obs # or, logistic loglikelihood if like_type != "RSS" # varIC - empirical variance of the influence curve adjusted for estimation of g # varDstar - empirical variance of the influence curve # var.psi - varianceo of the estimate # varIC.cv - cross-validated variance of the influence curve # penlikelihood.cv - penalized cross-validatedlikelihood # cv.res - all cross-validation results for each fold #------------------------------------------------------------ ctmle_EY1 <- function(Y, Delta, W, Wg=W, Q=NULL, Qform=NULL,Qbounds=NULL, cvQinit=FALSE, alpha=.995, family= "gaussian", SL.library=NULL, gbound=.025,like_type = "RSS", enrich = FALSE, reduce=FALSE, fluctuation="logistic", verbose=FALSE, detailed=FALSE, PEN=TRUE) { n <- length(Y) maptoYstar <- fluctuation=="logistic" if(is.null(colnames(W))){ colnames(W) <- paste("W",1:NCOL(W), sep="") } invalid.name <- nchar(colnames(W)) == 0 if(any(invalid.name)){ colnames(W)[invalid.name] <- paste(".internalW", which(invalid.name), sep="") } if(is.null(colnames(Wg))){ colnames(Wg) <- paste("W",1:NCOL(Wg), sep="") } invalid.name <- nchar(colnames(Wg)) == 0 if(any(invalid.name)){ colnames(Wg)[invalid.name] <- paste(".internalW", which(invalid.name), sep="") } if(!is.null(Q)){ if(NCOL(Q)>1){ Q <- Q[,1] } else { Q <- as.matrix(Q) } } Qinit <- stage1_EY1(Y,X=cbind(Delta,W), Qform=Qform, family, Qbounds=Qbounds, Qinit=Q, alpha=alpha, SL.library, cvQinit=cvQinit,maptoYstar=maptoYstar, verbose) if (enrich | reduce){ Wg <- enrich_W(Y,A,W=Wg,Qinit$Q, family=family[1], enrich, reduce,verbose=verbose) } candidates <- stage2(Y=Qinit$Y, X=data.frame(Delta, Wg), Qinit$Q, family=Qinit$family, Qinit$Qbounds,ab=Qinit$ab, ncandidates=NCOL(Wg), training_set=rep(TRUE,length(Y)), like_type=like_type,gbound=gbound, verbose=verbose) if(verbose) {cat("\t\t-----Cross-validation to select estimator-----\n")} cv.res <- cv(Y=Qinit$Y,X=data.frame(Delta,Wg), est.all=candidates$results.all$est,Q=Qinit$Q, family=Qinit$family, Qbounds=Qinit$Qbounds, ab=Qinit$ab, ncandidates=sum(!is.na(candidates$results.all$varIC)), like_type=like_type,gbound=gbound, verbose=verbose, PEN=PEN) Q.init <- Qinit$Q if(family == "binomial" | maptoYstar){Q.init <- plogis(Q.init)} psi.init <- mean(Q.init) * diff(Qinit$ab)+ Qinit$ab[1] if(verbose) { cat("\tEstimate based on initial, untargeted Q:", round(psi.init,4),"\n") cat(paste("\tCross validation complete.\n")) } # add varDstar and varIC on the original scale candidates$results.all$varIC_origscale <- candidates$results.all$varIC * ifelse(maptoYstar, (diff(Qinit$ab))^2, 1) candidates$results.all$varDstar_origscale <- candidates$results.all$varDstar * ifelse(maptoYstar, (diff(Qinit$ab))^2, 1) returnVal <- (list(candidates=candidates, best_k=cv.res$best_k, est= candidates$results.all$est[cv.res$best_k], CI = candidates$results.all$est[cv.res$best_k]+c(-1.96,1.96) * sqrt(candidates$results.all$varIC_origscale[cv.res$best_k]/n), Qinit=Qinit, ab = Qinit$ab, likelihood= candidates$results.all$likelihood[cv.res$best_k], varIC = candidates$results.all$varIC_origscale[cv.res$best_k], varDstar = candidates$results.all$varDstar_origscale[cv.res$best_k], varIC.cv=cv.res$varIC[cv.res$best_k], var.psi=candidates$results.all$varIC_origscale[cv.res$best_k]/n, penlikelihood.cv = cv.res$penlikelihood[cv.res$best_k], cv.res=cv.res)) returnVal$pvalue <- 2*pnorm(-abs(returnVal$est)/sqrt(returnVal$var.psi)) just_enough <- c(2:4,6,8,9,11,14) if (!detailed) { returnVal <- returnVal[just_enough] } class(returnVal) <- "ctmle" return(returnVal) }