Discussion of Time Dependent Covariate Data Structure for "coxph" in R ====================================================================== E. Slud, 12/9/2021 This material is adapted and from the TimDep.RLog handout, topic (8) in the Statistical Computing Handouts. > library(survival) > data(cancer) ## "NCCTG lung cancer study" = Mayo lung cancer study data(lung) ## same dataset, 228 x 10 > names(lung) [1] "inst" "time" "status" "age" "sex" "ph.ecog" [7] "ph.karno" "pat.karno" "meal.cal" "wt.loss" lung$status = lung$status-1 ### was originally coded 1,2 ## Objective here is to take a dataset with only baseline covariates and # code it into a (start, stop) format to handle the covariate > sextim = (2-cancer$sex)*log(cancer$time) ## The function "tmerge" in the "survival" package can do this. We do it by hand ## below, for the particular case of deterministically time-varying covariate. > ordtim = sort(unique(lung$time)) ## 186 unique times maxind = c(outer(lung$time, ordtim, function(x,y) x>= y) %*% rep(1,186)) ## these are the indices of the 228 patients' times among the ordered times > sum(abs(ordtim[maxind]-lung$time)) ## this is the verification [1] 0 repind = rep(1:228,maxind) ## repeat each index i maxind[i] times length(repind) [1] 20022 ## build up the vectors "start" and "stop" times step by step, in a for-loop > start = NULL; stop = NULL for(i in 1:nrow(lung)) { stop = c(stop, ordtim[1:maxind[i]]) start = c(start, c(0,ordtim)[1:maxind[i]]) } newfram = cbind.data.frame(start=start, stop=stop, status = 0, lung[repind, c(5:7,10)], sextim = (2-lung$sex[repind])*log(stop)) > newfram$status[cumsum(maxind)] = lung$status 0 1 19857 165 ### agrees with 165=sum(lung$status) ### Now have a 10022 x 8 data-frame, with the following columns > names(newfram) [1] "start" "stop" "status" "sex" "ph.ecog" "ph.karno" "wt.loss" [8] "sextim" ## Finally, we can do the time-dependent Cox-model fit, which we compare with # the same Cox-model fit wuthout the time-dependent covariate > lungA = coxph(Surv(time,status) ~ sex + ph.ecog + ph.karno + wt.loss, data=lung) lungA Call: coxph(formula = Surv(time, status) ~ sex + ph.ecog + ph.karno + wt.loss, data = lung) coef exp(coef) se(coef) z p sex -0.623193 0.536229 0.176389 -3.533 0.000411 ph.ecog 0.753428 2.124270 0.194168 3.880 0.000104 ph.karno 0.013814 1.013910 0.009908 1.394 0.163241 wt.loss -0.009008 0.991032 0.006511 -1.383 0.166516 Likelihood ratio test=31.06 on 4 df, p=2.97e-06 n= 213, number of events= 151 (15 observations deleted due to missingness) > lungB = coxph(Surv(start,stop,status) ~sex + ph.ecog + ph.karno + wt.loss, data=newfram) ## uses time-dependent framework but has no time-dependent covariates lungB Call: coxph(formula = Surv(start, stop, status) ~ sex + ph.ecog + ph.karno + wt.loss, data = newfram) coef exp(coef) se(coef) z p sex -0.623193 0.536229 0.176389 -3.533 0.000411 ph.ecog 0.753428 2.124270 0.194168 3.880 0.000104 ph.karno 0.013814 1.013910 0.009908 1.394 0.163241 wt.loss -0.009008 0.991032 0.006511 -1.383 0.166516 Likelihood ratio test=31.06 on 4 df, p=2.97e-06 n= 19303, number of events= 151 (719 observations deleted due to missingness) > lungC = coxph(Surv(start,stop,status) ~sex + ph.ecog + ph.karno + wt.loss + sextim, data=newfram) lungC Call: coxph(formula = Surv(start, stop, status) ~ sex + ph.ecog + ph.karno + wt.loss + sextim, data = newfram) coef exp(coef) se(coef) z p sex -2.006232 0.134495 1.127879 -1.779 0.075278 ph.ecog 0.735771 2.087091 0.194670 3.780 0.000157 ph.karno 0.012474 1.012553 0.009965 1.252 0.210615 wt.loss -0.009612 0.990434 0.006577 -1.462 0.143877 sextim -0.255592 0.774458 0.204005 -1.253 0.210254 Likelihood ratio test=32.76 on 5 df, p=4.204e-06 n= 19303, number of events= 151 (719 observations deleted due to missingness) ### To account for the patients with some missing data: > sapply(lung[,c(5:7,10)], function(col) sum(is.na(col))) sex ph.ecog ph.karno wt.loss 0 1 1 14