LOG showing coding for Time-Dependent Cox Model Analysis in R using Mayo Lung Cancer Data ============================================== 10/28/05 > library(survival); data(lung) ### 15 out of 228 observations deleted below due to missing data > lung$status <- lung$status-1 ### coded 1,2 initially > lung1 <- coxph(Surv(time,status) ~ sex + ph.ecog +ph.karno + wt.loss, data = lung) ### Sex is really important, so let's look at Stratification by Sex > lung2 <- coxph(Surv(time,status) ~ strata(sex) + ph.ecog + ph.karno + wt.loss, data = lung) > round(lung1$coef,3) sex ph.ecog ph.karno wt.loss -0.623 0.753 0.014 -0.009 > round(lung2$coef,3) ph.ecog ph.karno wt.loss 0.723 0.012 -0.010 ### Not much difference betwen coefficients ! > fits2 <- survfit(lung2) > fits2$strata sex=1 sex=2 92 48 ### Thus in the fits2$time and fits2$surv vectors, we have the 92 ### stratum-1 values concatenated with the 48 stratum-2 values. > plot(fits2$time[1:92], -log(fits2$surv[1:92])) points(fits2$time[93:140], -log(fits2$surv[93:140]), pch=5) ## We can see two rather different curvatures for the nuisance ## (=baseline) hazard in the two sex-groups. > tmpreg <- survreg(Surv(time,status) ~ strata(sex) + ph.ecog + ph.karno + wt.loss, data = lung, dist="weibull") > 1/tmpreg$scale sex=1 sex=2 1.298377 1.597784 ## These "shape parameters are more or less in accord with the nuisance-hazard plot by stratum ---------------------------------------------------------- This suggests that we try to fit a unified time-dependent covariate Cox model, unstratified, but with extra covariate sextim = (2-sex)*log(time) ---------------------------------------------------------- So we code the data into a new data-frame with columns start, stop, status, sex, ph.ecog, ph.karno, wt.loss, and sexlog=(2-sex)*log(time). The start, stop intervals are going to be the ordered distinct failure-times. > ordtim <- as.numeric(names(table(lung$time[lung$status==1]))) ## 139 ordered distinct failure times, within a total of 186 ## ordered distinct event times > names(lung) [1] "inst" "time" "status" "age" "sex" "ph.ecog" [7] "ph.karno" "pat.karno" "meal.cal" "wt.loss" > start <- NULL; stop <- NULL ## create a vector showing the index of the largest ordered ## failure time <= each subject's event time ### NOTE THE FOLLOWING CORRECTED 10/31/05 maxind <- c(outer(lung$time, ordtim, function(x,y) x>= y) %*% rep(1,139)) repind <- rep(1:228,maxind) ## repeat each index i maxind[i] times > length(repind) [1] 16031 ### This will be the number of rows of the new data frame ## now create start, stop as simple concatenated vectors for(i in 1:nrow(lung)) if(maxind[i] > 0) { stop <- c(stop, ordtim[1:maxind[i]]) start <- c(start, c(0,ordtim)[1:maxind[i]]) } newfram <- cbind.data.frame(start=start, stop=stop, lung[repind, c(3,5:7,10)], sextim = (2-lung$sex[repind])*log(stop)) ### Now have a 16031 x 8 data-frame, with the following columns > names(newfram) [1] "start" "stop" "status" "sex" "ph.ecog" "ph.karno" [7] "wt.loss" "sextim" ### NOTE: correction 10/31/05 > newfram$status <- rep(0, nrow(newfram)) newfram$status[cumsum(maxind)] <- lung$status > table(newfram$status) 0 1 15866 165 ### agrees with 165=sum(lung$status) ## Finally, we can do the time-dependent Cox-model fit. > lung3B <- coxph(Surv(start,stop,status) ~sex + ph.ecog + ph.karno + wt.loss, data=newfram) ### This was just a check: the coefficients and logLik ### agree precisely with those of lung1 . > lung3 <- coxph(Surv(start,stop,status) ~ . , data=newfram, init=c(lung1$coef,0)) ## Specified coefficient without time-dependent term as the ## initial value. (This helps convergence.) n=15414 (617 observations deleted due to missing) coef exp(coef) se(coef) z p sex -2.00414 0.135 1.12763 -1.78 0.07600 ph.ecog 0.73578 2.087 0.19467 3.78 0.00016 ph.karno 0.01248 1.013 0.00996 1.25 0.21000 wt.loss -0.00961 0.990 0.00658 -1.46 0.14000 sextim -0.25523 0.775 0.20396 -1.25 0.21000 > lung3$loglik [1] -659.4920 -658.6455 ### Here initial loglik was that from lung1 ! ### This confirms that the time-dependent covariate ### does NOT make a big difference. ### So find that the time-dependent covariate is NOT ### significant! Suggests we did not need separate ### nuisance functions for the two sexes. ### Look again at the plot of the two sex-stratum ### cumulative hazard curves > round(log(tmpreg$scale),4) sex=1 sex=2 -0.2611 -0.4686 > round(sqrt(diag(summary(tmpreg)$var)[5:6]),4) Log(scale) Log(scale) 0.0852 0.1116 > tmpreg$var[5,6]/sqrt(tmpreg$var[5,5]*tmpreg$var[6,6]) [1] -0.0900263 ## very little correlation ### So test of difference between scale parameters ### is not significant ! ### p-value from Weibull stratified regression would have ### been: > c(2*(1-pnorm(c(1,-1) %*% log(tmpreg$scale)/sqrt( c(1,-1) %*% tmpreg$var[5:6,5:6] %*% c(1,-1))))) [1] 0.1563731 ### Compare with p-value .21 in Cox-model setting