LOG OF DATA ANALYSES COMPARING PARAMETRIC REGRESSIONS.
=====================================================

> library(MASS)       ## library containing the VA lung cancer data
> library(survival)   ## survival package
> table(VA$cell, VA$treat)
     1  2
  1 15 20    ## Can see from this and Appendix A of K & P
  2 30 18    ## that the "cell" types are squamous=1,
  3  9 18    ## small=2, adeno.=3, and large=4
  4 15 12
> names(VA)
[1] "stime"     "status"    "treat"     "age"       "Karn"      "diag.time"
[7] "cell"      "prior"
   ## treat, cell, and prior are "factors", i.e. categorical

> VA.Weib <- survreg( Surv(stime, status) ~ . , data=VA, dist="weibull")
   ## fit versus everything
> round(VA.Weib$coef, 5)
(Intercept)      treat2         age        Karn   diag.time       cell2
    3.26201    -0.22852     0.00610     0.03007    -0.00047    -0.82618
      cell3       cell4     prior10
   -1.13273    -0.39768    -0.04390
### Just like p. 72 in the Kalbfleisch and Prentice book, but note that the
### "treat" coefficient is negative because it multiplies the contrast of
### treat=2 versus treat=1.  Also, the "cell" contrasts are versus cell=1,
### so e.g. "cell4" represents large minus squamous, which is opposite to
### the direction given in Table 3.1.

> VA.Weib$loglik
[1] -748.0912 -715.5513    ## initial and final
> round(sqrt(diag(VA.Weib$var)), 4)
(Intercept)      treat2         age        Karn   diag.time       cell2
     0.6625      0.1868      0.0086      0.0048      0.0084      0.2463
      cell3       cell4     prior10  Log(scale)
     0.2576      0.2547      0.2123      0.0663
## So from this it looks as though [using backwards selection] we could
## delete treat, age, diag.time, and prior, but not cell-type.

> VA.Weib2 <- survreg( Surv(stime, status) ~ Karn + cell , data=VA, dist="weibull")
> VA.Weib2$loglik
[1] -748.0912 -716.5149    ## so hardly any logLik difference:
                           ## deviance 2*(716.515-715.551) = 1.928
> 1 - pchisq(1.928, 3)
[1] 0.5874842    ## = p-value
## [Note: dropping treat, age, diag.time, and prior removes 4 coefficients,
## so df=4, giving 1-pchisq(1.928,4) = 0.749, is arguably the right
## comparison; either way the deletion is clearly acceptable.]

## Now can consider discriminating versus the exponential model:
## "scale" = VA.Weib$scale = 0.92812 is the reciprocal of the Weibull shape parameter.
> tmpfit <- survfit(Surv(VA$stime, VA$status) ~ 1)
> plot(log(tmpfit$time[1:100]), log(-log(tmpfit$surv[1:100])))
   ### This is close to a line, as it should be for Weibull
> lm(log(-log(tmpfit$surv[1:100])) ~ log(tmpfit$time[1:100]))$coef
            (Intercept) log(tmpfit$time[1:100])
             -4.1692663               0.8779302
## Thus the graphical (marginal) shape estimate is indeed close to 0.9, and
## within the fitted model log(scale) = -0.074 has SE 0.066, so the model is
## not significantly different from exponential.

> survreg( Surv(stime, status) ~ Karn + cell, data=VA, dist="weibull",
       scale=1)$loglik    ## same as fitting with dist="exponential"
[1] -751.2212 -716.9721    ### almost unchanged

## But let's over-plot the KM curve with the fitted Weibull, on cum-hazard scale:
> plot(tmpfit$time, -log(tmpfit$surv), ylim=c(0,5), xlab="Time",
       ylab="Cumulative Hazard",
       main="VA Lung Cancer Cum Haz, KM vs Weib vs Log-Logistic")
> Weib2.est <- numeric(100)
  for(i in 1:100) Weib2.est[i] <- -log(mean(exp(-(exp(-VA.Weib2$lin)*
          (10*i))^(1/VA.Weib2$scale))))
  lines(10*(1:100), Weib2.est, lty=3)
## We need the expectation over covariates, in the survreg parameterization,
## to do this over-plot (a reusable version of this loop is sketched just below).
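## Aside [added sketch, not part of the original session]: the over-plot loop
## above can be wrapped as a small helper.  The name weibCumHaz is made up
## here; it assumes a survreg fit with dist="weibull".

weibCumHaz <- function(fit, tgrid) {
   ## survreg Weibull: S(t|x) = exp(-(t*exp(-linpred))^(1/scale)), so the
   ## population-averaged cumulative hazard at t is -log of the mean survival
   lp <- fit$linear.predictors
   sapply(tgrid, function(t) -log(mean(exp(-(t*exp(-lp))^(1/fit$scale)))))
}
## e.g.  lines(10*(1:100), weibCumHaz(VA.Weib2, 10*(1:100)), lty=3)
## should reproduce the Weib2.est curve drawn above.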
### Now how would we compare other fitted models?  Can use loglik as a rough
### guide [although non-nested models are not strictly comparable this way]:
### e.g. dist="extreme" gives -899.1 while dist="loglogistic" gives -712.6 ?!

> VA.LgLgs <- survreg( Surv(stime, status) ~ Karn + cell , data=VA,
       dist="loglogistic")
> VA.LgLgs$loglik
[1] -750.2658 -712.5941
> lglgst.est <- numeric(100)
  for(i in 1:100) lglgst.est[i] <- -log(mean(plogis(-(log(10*i)-
          VA.LgLgs$lin)/VA.LgLgs$scale)))
  lines(10*(1:100), lglgst.est, lty=5)
  legend(locator(1), legend=c("Weibull","Log-Logistic"), lty=c(3,5))
## Picture saved as KMWeib.pdf

------------------------------------------------------------------
## Note that we can recover the log-likelihood calculations in these
## models "by hand", as follows: first, for Weibull:
> xvec1 <- (log(VA$stime) - VA.Weib2$lin)/VA.Weib2$scale
> sum(VA$status*(xvec1 - log(VA$stime*VA.Weib2$scale)) - exp(xvec1))
[1] -716.5149    ## which agrees with VA.Weib2$loglik[2]
## Next try for log-logistic:
> xvec2 <- (log(VA$stime) - VA.LgLgs$lin)/VA.LgLgs$scale
> sum(VA$status*log(dlogis(xvec2)/(VA$stime*VA.LgLgs$scale*
       plogis(-xvec2))) + log(plogis(-xvec2)))
[1] -712.594    ### NB this agrees with VA.LgLgs$loglik[2]
## (A generic wrapper for these two "by hand" computations is sketched at
## the end of this subsection.)
-----------------------------------------------------------------
NB. IT IS VERY PUZZLING THAT THE VISUAL FIT OF THE WEIBULL MODEL IN THE
OVER-PLOTTED CUMULATIVE HAZARD PICTURE IS BETTER THAN THAT FOR THE
LOG-LOGISTIC MODEL, WHILE THE LOG-LOGISTIC LOG-LIKELIHOOD IS HIGHER THAN
FOR THE WEIBULL REGRESSION MODEL.  THE EXPLANATION MUST LIE IN THE
CLOSENESS OF FIT OF THE EARLY PART OF THE CUMULATIVE-HAZARD CURVE, SAY
UP TO TIME 300.  WE CHECK THIS:

> timL300 <- (1:137)[VA$stime < 300]    ### 124 cases!
> sum((VA$status*(xvec1 - log(VA$stime*VA.Weib2$scale)) - exp(xvec1))[timL300])
[1] -611.1947    ## log-lik contribution using fitted Weibull parameters
> sum((VA$status*log(dlogis(xvec2)/(VA$stime*VA.LgLgs$scale*
       plogis(-xvec2))) + log(plogis(-xvec2)))[timL300])
[1] -607.5009    ## log-lik contribution using fitted log-logistic

THUS THERE IS ADDITIONAL IMPROVEMENT: LOG-LOGISTIC GAINS ITS ADVANTAGE ON
THE EARLY PART OF THE TIME AXIS, AND THE WORSE VISUAL FIT IS DUE ONLY TO
THE LATER TIMES.  GIVEN THAT THE LOG-LIKELIHOODS ARE NOT STRICTLY
COMPARABLE BETWEEN THE TWO MODELS (NO WILKS THEOREM APPLIES!), WHICH OF
THE TWO MODELS WOULD YOU PREFER ??

=============================================================
Continuation 10/17/05, updated Fall 2021
FURTHER MATERIAL CONCERNS COX-MODEL FIT TO SAME DATA !!
------------------------------------------------------------
> VA.Cox <- coxph(Surv(stime,status) ~ Karn + cell, data=VA)
## Recall that the Cox model puts coefficients in the hazard, and the
## previous models in logT, so their signs are opposite!  (For the Weibull
## AFT fit, the implied proportional-hazards coefficients are -coef/scale,
## and here scale = 0.928 is close to 1.)  Also, Cox has no intercept:
> round(rbind(Weib=VA.Weib2$coef, Lgs=VA.LgLgs$coef, Cox=c(0,VA.Cox$coef)), 3)
     (Intercept)   Karn  cell2  cell3  cell4
Weib       3.481  0.029 -0.708 -1.108 -0.322
Lgs        2.480  0.036 -0.690 -0.778 -0.029
Cox        0.000 -0.031  0.715  1.158  0.326
## NOTE the close correspondence between Cox and Weibull.  Now compare
## their standard errors!  Weibull has a "scale" nuisance parameter,
## Cox a nuisance hazard function.
> round(rbind(Weib=sqrt(diag(VA.Weib2$var)[1:5]),
       Cox=c(0,sqrt(diag(VA.Cox$var)))), 3)
     (Intercept)  Karn cell2 cell3 cell4
Weib        0.34 0.005 0.226 0.253 0.250
Cox         0.00 0.005 0.253 0.293 0.277
## So relatively little accuracy is lost in fitting the Cox model!

> tmpsurv <- survfit(VA.Cox)
> tmpbase <- basehaz(VA.Cox)
> sum(abs(tmpbase$haz + log(tmpsurv$surv)))
[1] 2.068658e-15    ## Either of these gives the NUISANCE cumulative hazard.
## Note: the default is to center all nonconstant covariates at their means.
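## Aside [added sketch, not in the original log]: the two "by hand"
## likelihood computations above can be collected into one hypothetical
## wrapper; the name handLoglik is made up here, and it assumes a survreg
## fit with dist="weibull" or dist="loglogistic" and right-censored data.

handLoglik <- function(fit, time, status) {
   x <- (log(time) - fit$linear.predictors)/fit$scale
   if (fit$dist == "weibull") {
      ## events contribute the log density, censored cases the log survival
      sum(status*(x - log(time*fit$scale)) - exp(x))
   } else if (fit$dist == "loglogistic") {
      ## algebraically the same as the sum displayed above
      sum(status*log(dlogis(x)/(time*fit$scale)) + (1-status)*log(plogis(-x)))
   } else stop("only weibull/loglogistic handled in this sketch")
}
## handLoglik(VA.Weib2, VA$stime, VA$status)   ## should give -716.5149
## handLoglik(VA.LgLgs, VA$stime, VA$status)   ## should give -712.594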
## Next generate a comparative plot for Weib2, LgLgs, and Cox for the
## overall summary population hazard.
> tmpsfcn <- tmpsurv$surv
> summCox <- replace(tmpsfcn, T, 0)
  for(i in 1:nrow(VA)) summCox <- summCox + tmpsfcn^exp(VA.Cox$lin[i])
  summCox <- summCox/nrow(VA)
### Now we add the summary Cox survival to the Weibull and log-logistic plot:
> plot(tmpfit$time, -log(tmpfit$surv), ylim=c(0,5), xlab="Time",
       ylab="Cumulative Hazard", main=paste(
       "VA Lung Cancer Cum Haz, KM vs Weib vs Log-Logistic",
       "\n versus Summary from Cox-Model", sep=""))
> Weib2.est <- numeric(100)
  for(i in 1:100) Weib2.est[i] <- -log(mean(exp(-(exp(-VA.Weib2$lin)*
          (10*i))^(1/VA.Weib2$scale))))
  lines(10*(1:100), Weib2.est, lty=3)
  lines(10*(1:100), lglgst.est, lty=5)
  lines(tmpsurv$time, -log(summCox), lty=1)
  legend(locator(1), legend=c("Cox","Weibull","Log-Logistic"), lty=c(1,3,5))
> legend(locator(1), legend=c("Kaplan-Meier"), pch=1)
### Replaced KMWeib.pdf with this picture, which has the additional
### plotted Cox summary survival curve.

============================================================
Small simulation to illustrate the meaning of the parameters in survreg   3/7/08
==========================================

> set.seed(3131)
  Xvec = runif(100)
  rskfac = .04 + .45*Xvec
  ErrMat = log(array(rweibull(1000*100, 1.2, 0.7), c(1000,100)))
  Ests = array(0, c(1000,3))
  for(i in 1:1000) {
     Yvec = exp(rskfac + ErrMat[i,])
     tmp = survreg(Surv(Yvec, rep(1,100)) ~ Xvec)
     Ests[i,] = c(tmp$coef, tmp$scale)
  }
> var(Ests)    #### This is empirical, based on the simulation
             [,1]          [,2]          [,3]
[1,]  0.029268344 -0.0487377290 -0.0014501565
[2,] -0.048737729  0.1082110337 -0.0003607701
[3,] -0.001450157 -0.0003607701  0.0044056601
> tmp$var    ### This is "theoretical", based on the observed
             ### information matrix from the last simulation iteration
             (Intercept)          Xvec    Log(scale)
(Intercept)  0.033667848 -0.0516113297 -0.0014388816
Xvec        -0.051611330  0.1047278477 -0.0008612532
Log(scale)  -0.001438882 -0.0008612532  0.0044246751
## NB according to the parameterization of the simulation,
## Y = T and log(T) has mean .04 + .45*Z + log(V)
## with V ~ Weib(a=1.2, b=0.7).
## Both survreg and rweibull parameterize the Weibull with
## a = shape = 1.2 and b = scale = 0.7.  (When I write the Weibull in class
## with cumulative hazard function lambda*t^gamma, the corresponding shape
## and scale are respectively a = gamma and b = lambda^(-1/shape).)
> summary(Ests)
       V1                 V2                V3
 Min.   :-0.8722   Min.   :-0.7244   Min.   :0.5740
 1st Qu.:-0.4559   1st Qu.: 0.2608   1st Qu.:0.7777
 Median :-0.3325   Median : 0.4711   Median :0.8192
 Mean   :-0.3319   Mean   : 0.4716   Mean   :0.8202
 3rd Qu.:-0.2040   3rd Qu.: 0.6876   3rd Qu.:0.8630
 Max.   : 0.2866   Max.   : 1.5698   Max.   :1.0216
### At this point, we have a theoretical model (which we used for the
## simulation, with no censoring), in which
##    log(Y) = .04 + .45*Z + log(W0),  where W0 has a=1.2, b=0.7.
## This is the same as saying
##    log(Y) = (.04 + log(0.7)) + .45*Z + (1/1.2)*log(V)
## where V ~ Expon(1).
## NOTE that .04 + log(0.7) = -0.31667, which closely matches the average
## estimated intercept above; the average estimated slope coefficient
## closely matches the true Z-coefficient 0.45; and the coefficient
## 1/1.2 = .8333 closely matches the average estimated "scale" parameter
## in the simulation.
##
## So in general, if log(Y) = c1 + beta'Z + epsilon where
## exp(epsilon) ~ Weibull(a=shape, b=scale), we assert
##    Int = c1 + log(b),   scale = 1/a.
## (We will check that further below.)
## Alternatively, if epsilon ~ logistic(mu=location, sig=scale), i.e.
## exp(epsilon) is log-logistic, we should have Int = c1 + mu, Scale = sig.
## (A single-sample numerical check of the Weibull assertion is sketched
## just below; batch confirmations follow.)
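## Aside [added sketch, not from the original log]: a quick single-sample
## check of the assertion Int = c1 + log(b), scale = 1/a, with hypothetical
## values c1 = 0.5, beta = 0.3, a = 1.5, b = 2 chosen here for illustration.

set.seed(777)
Zchk   <- runif(2000)
Tchk   <- exp(0.5 + 0.3*Zchk) * rweibull(2000, shape=1.5, scale=2)
chkfit <- survreg(Surv(Tchk, rep(1,2000)) ~ Zchk)   ## default dist="weibull"
round(c(chkfit$coef, Scale=chkfit$scale), 3)
## anticipated: Intercept = 0.5 + log(2) = 1.193,  Zchk = 0.3,  Scale = 1/1.5 = 0.667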
## First we confirm this with different c1 and (a,b), and with larger sample
## size and larger batch size:
> Est2 = replace(Ests, T, 0)
  for(i in 1:1000) {
     tmp = survreg(Surv(exp(rskfac+0.9)*rweibull(1000, 0.8, 0.4),
                rep(1,1000)) ~ rep(Xvec,10))
     Est2[i,] = c(tmp$coef, tmp$scale)
  }
### Now c1=.94, a=.8, b=.4, and we compare the average estimates with
### those anticipated by the formula above:
> rbind(apply(Est2,2,mean), c(.94+log(.4), .45, 1/.8))
           [,1]      [,2]     [,3]
[1,] 0.02838583 0.4459365 1.247923
[2,] 0.02370927 0.4500000 1.250000

## Now we confirm with a simulation in which the regression terms are
## entirely absent:
> Est3 = replace(Ests, T, 0)
  for(i in 1:1000) {
     tmp = survreg(Surv(rweibull(1000, 0.8, 0.4), rep(1,1000)) ~ rep(Xvec,10))
     Est3[i,] = c(tmp$coef, tmp$scale)
  }
> rbind(apply(Est3,2,mean), c(log(.4), 0, 1/.8))
           [,1]        [,2]     [,3]
[1,] -0.9196989 0.006638826 1.247242
[2,] -0.9162907 0.000000000 1.250000
### OK, excellent agreement, in view of the corresponding standard errors:
> sqrt(diag(tmp$var)/1000)
 (Intercept)         Xvec   Log(scale)
0.0026780333 0.0046694923 0.0007971386
### Both in this simulation and in another done with batch size 100, the
### difference in the scale-parameter case is nearly significant: but there
### is often a bias of order 1/sample-size in an MLE, so we are OK.

### Now we do one more simulation, checking agreement in the log-logistic case:
> Est4 = replace(Ests, T, 0)
  for(i in 1:1000) {
     tmp = survreg(Surv(exp(rskfac+rlogis(1000, 0.6, 1.3)),
                rep(1,1000)) ~ rep(Xvec,10), dist="loglogistic")
     Est4[i,] = c(tmp$coef, tmp$scale)
  }
> rbind(apply(Est4,2,mean), c(.04+.6, .45, 1.3))
          [,1]      [,2]     [,3]
[1,] 0.6465393 0.4429854 1.297658
[2,] 0.6400000 0.4500000 1.300000
> sqrt(diag(tmp$var)/1000)
  (Intercept) rep(Xvec, 10)    Log(scale)
 0.0047180475  0.0082511987  0.0008326295
#### So we still have very good agreement.

#### WE HAVE NOW DESCRIBED THE INTERCEPT AND SCALE PARAMETERS ESTIMATED
#### BY THE survreg FUNCTION.
### NOTE: we have seen how to work with dist="weibull" or "loglogistic";
### "lognormal" is just like loglogistic in having
###    location parameter for epsilon = Intercept   and
###    scale parameter for epsilon = scale.
### For all of these distributions, it is understood that the response is
### log(time).  Certain other distributions with the same log(time)
### response are simply special cases with fixed "scale" parameter: i.e.,
### "exponential" coincides with the choices dist="weibull" and scale=1
### (fixing the scale parameter), and "rayleigh" with dist="weibull" and
### scale=0.5 (corresponding to Weibull shape = 2).
### For the other standard choices of "dist", namely dist="extreme",
### "logistic", "gaussian", or "t", the same model will be used with
### Y = "time" itself as the response.  Note that if the underlying
### data frame has a column called "time" and its logarithm in a column
### called "logtim", then calling the function
###      survreg(Surv(logtim,status) ~ ... , dist="logistic")
### is equivalent to calling
###      survreg(Surv(time,status) ~ ... , dist="loglogistic")
### and similarly, calling
###      survreg(Surv(logtim,status) ~ ... , dist="gaussian")
### is equivalent to calling
###      survreg(Surv(time,status) ~ ... , dist="lognormal")
### (a quick numerical check of this equivalence is sketched below).
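## Aside [added sketch, not part of the original log]: a quick check of the
## last equivalence on simulated uncensored data; the object names here are
## made up for illustration.

set.seed(11)
time2  <- rlnorm(200, meanlog=1, sdlog=0.5)
stat2  <- rep(1, 200)
fitLN  <- survreg(Surv(time2, stat2) ~ 1, dist="lognormal")
fitGau <- survreg(Surv(log(time2), stat2) ~ 1, dist="gaussian")
rbind(LN=c(fitLN$coef, fitLN$scale), Gau=c(fitGau$coef, fitGau$scale))
## the two rows should agree; the two log-likelihoods should differ only by
## the Jacobian term sum(log(time2)), since the lognormal density of T
## carries a 1/t factor relative to the normal density of log(T):
fitLN$loglik[2] - fitGau$loglik[2]    ## should equal -sum(log(time2))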