Discussion of Time Dependent Covariate Data Structure for "coxph" in R
======================================================================
E. Slud,  12/9/2021

This material is adapted and from the TimDep.RLog handout, topic (8) in the 
Statistical Computing Handouts.

> library(survival)
> data(cancer)   ## "NCCTG lung cancer study" = Mayo lung cancer study
  data(lung)     ## same dataset, 228 x 10
> names(lung)
 [1] "inst"      "time"      "status"    "age"       "sex"       "ph.ecog"  
 [7] "ph.karno"  "pat.karno" "meal.cal"  "wt.loss"  

  lung$status = lung$status-1    ### was originally coded 1,2

## Objective here is to take a dataset with only baseline covariates and
#   code it into a (start, stop) format to handle the covariate 
> sextim = (2-cancer$sex)*log(cancer$time)

## The function "tmerge" in the "survival" package can do this. We do it by hand 
##   below, for the particular case of deterministically time-varying covariate.

> ordtim = sort(unique(lung$time))             ## 186 unique times
  maxind = c(outer(lung$time, ordtim, function(x,y)  x>= y) %*% rep(1,186)) 
   ## these are the indices of the 228 patients' times among the ordered times

> sum(abs(ordtim[maxind]-lung$time))     ## this is the verification
[1] 0

  repind = rep(1:228,maxind) ## repeat each index i maxind[i] times
  length(repind)
[1] 20022       

## build up the vectors "start" and "stop" times step by step, in a for-loop

> start = NULL; stop = NULL
  for(i in 1:nrow(lung))    {
      stop = c(stop, ordtim[1:maxind[i]])
      start = c(start, c(0,ordtim)[1:maxind[i]])   }
  newfram = cbind.data.frame(start=start, stop=stop, status = 0,
      lung[repind, c(5:7,10)], sextim = 
      (2-lung$sex[repind])*log(stop))

> newfram$status[cumsum(maxind)] = lung$status
    0     1 
19857   165             ### agrees with 165=sum(lung$status)

### Now have a 10022 x 8 data-frame, with the following columns
> names(newfram)
[1] "start"    "stop"     "status"   "sex"      "ph.ecog"  "ph.karno" "wt.loss" 
[8] "sextim"
 
## Finally, we can do the time-dependent Cox-model fit, which we compare with 
#   the same Cox-model fit wuthout the time-dependent covariate

> lungA = coxph(Surv(time,status) ~ sex + ph.ecog +
      ph.karno + wt.loss, data=lung)
  lungA
Call:
coxph(formula = Surv(time, status) ~ sex + ph.ecog + ph.karno + 
    wt.loss, data = lung)

              coef exp(coef)  se(coef)      z        p
sex      -0.623193  0.536229  0.176389 -3.533 0.000411
ph.ecog   0.753428  2.124270  0.194168  3.880 0.000104
ph.karno  0.013814  1.013910  0.009908  1.394 0.163241
wt.loss  -0.009008  0.991032  0.006511 -1.383 0.166516

Likelihood ratio test=31.06  on 4 df, p=2.97e-06
n= 213, number of events= 151 
   (15 observations deleted due to missingness)

> lungB = coxph(Surv(start,stop,status) ~sex + ph.ecog +
      ph.karno + wt.loss, data=newfram)       
         ## uses time-dependent framework but has no time-dependent covariates
  lungB
Call:
coxph(formula = Surv(start, stop, status) ~ sex + ph.ecog + ph.karno + 
    wt.loss, data = newfram)

              coef exp(coef)  se(coef)      z        p
sex      -0.623193  0.536229  0.176389 -3.533 0.000411
ph.ecog   0.753428  2.124270  0.194168  3.880 0.000104
ph.karno  0.013814  1.013910  0.009908  1.394 0.163241
wt.loss  -0.009008  0.991032  0.006511 -1.383 0.166516

Likelihood ratio test=31.06  on 4 df, p=2.97e-06
n= 19303, number of events= 151 
   (719 observations deleted due to missingness)

> lungC = coxph(Surv(start,stop,status) ~sex + ph.ecog +
      ph.karno + wt.loss + sextim, data=newfram)   
  lungC
Call:
coxph(formula = Surv(start, stop, status) ~ sex + ph.ecog + ph.karno + 
    wt.loss + sextim, data = newfram)

              coef exp(coef)  se(coef)      z        p
sex      -2.006232  0.134495  1.127879 -1.779 0.075278
ph.ecog   0.735771  2.087091  0.194670  3.780 0.000157
ph.karno  0.012474  1.012553  0.009965  1.252 0.210615
wt.loss  -0.009612  0.990434  0.006577 -1.462 0.143877
sextim   -0.255592  0.774458  0.204005 -1.253 0.210254

Likelihood ratio test=32.76  on 5 df, p=4.204e-06
n= 19303, number of events= 151 
   (719 observations deleted due to missingness)

  ### To account for the patients with some missing data:

> sapply(lung[,c(5:7,10)], function(col) sum(is.na(col)))
     sex  ph.ecog ph.karno  wt.loss 
       0        1        1       14