DATA DESCRIPTION: Log to Examine Alternative Models and Fits for Kyphosis Data is LogLinear vs GLM Setting ===================================== 5/10/03 > kyphA <- read.table("kyphosis", skip=8, header=T) > dim(kyphA) [1] 81 5 ## This is correct: now recode ### to get categorical variables > names(kyphA) [1] "Id" "Kyphosis" "Age" "Number" "Start" > Ncat <- ifelse(kyphA$Number < 4, "A", ifelse(kyphA $Number > 5, "C","B")) > table(Ncat) Ncat A B C 35 35 11 > Scat <- ifelse(kyphA$Start < 8, "a", ifelse(kyphA $Start > 13, "c", "b")) > table(Scat) Scat a b c 17 30 34 > Acat <- ifelse(kyphA$Age <96, ifelse(kyphA$Age < 31,1,2), ifelse(kyphA$Age < 137,3,4)) > table(Acat) Acat 1 2 3 4 22 21 21 17 > kyphB <- cbind.data.frame(kyph=factor(kyphA$Kyphosis), Acat=factor(Acat), Ncat=factor(Ncat), Scat=factor(Scat)) ### Now all four categories are factors. > levels(kyphB$kyph) [1] "absent" "present" ### So "absent" is coded 1, and ### response indicator in glm is "present" #### We have two data-structures, the original with ### ordered (numerical) predictors and the second with ### ordered categories. We compare the analysis of the ### latter, in R, by glm and loglin. ### We begin by doing exploratory analysis with the ### categorically re-coded variables. > glmB <- glm(kyph ~ ., family=binomial, data=kyphB) > anova(glmB) Analysis of Deviance Table, Model: binomial, link: logit Response: kyph, Terms added sequentially Df Deviance Resid. Df Resid. Dev NULL 80 83.234 Acat 3 7.394 77 75.841 Ncat 2 9.458 75 66.383 Scat 2 12.269 73 54.114 ## This makes the age-category least useful: > anova(glm(kyph~ Scat + Ncat + Acat, family=binomial, data=kyphB)) > contrasts(kyphB$Acat) 2 3 4 1 0 0 0 2 1 0 0 3 0 1 0 4 0 0 1 ### Corresponds to "treatment" contrasts > summary(glm(kyph~ Scat + Ncat + Acat, family=binomial, data=kyphB))$coef Estimate Std. Error z value Pr(>|z|) (Intercept) -2.5668 1.3515 -1.8992 0.0575 Acat2 2.4586 1.2558 1.9578 0.0503 Acat3 2.9454 1.2854 2.2915 0.0219 Acat4 2.2259 1.4075 1.5815 0.1138 NcatB 0.2269 0.8218 0.2762 0.7824 NcatC 1.7968 1.0479 1.7146 0.0864 Scatb -1.2477 0.7537 -1.6555 0.0978 Scatc -3.4125 1.1958 -2.8538 0.0043 ### Thus there seems to be a monotone tendency in ### Age (later categories have larger coefficients) ### Ncat (larger Number implies larger coeff) ### Scat (larger Start implies more neg coeff) ### With the exception of Scatc, all individual dummy- ### coefficients are only borderline-significant ### at best. But glm using the numeric predictors ### does better. > summary(glmB)$aic [1] 70.11384 ### based on 8 fitted parameters, so ### logLik = -0.5*(70.114 + 2*8) = -27.057 ### which agrees with > sum(kyphB$y*log(kyphB$fit) + (1-kyphB$y)* log(1-kyphB$fit)) > glmA <- glm(Kyphosis ~ Start + Number + Age, family= binomial, data=kyphA) ## AIC = 69.38, logLik= -31.69 > anova(glmA) Df Deviance Resid. Df Resid. Dev NULL 80 83.234 Start 1 15.162 79 68.072 Number 1 3.536 78 64.536 Age 1 3.157 77 61.380 ### But in this glm it makes more sense to try ### interactions ! > glmA2 <- glm(Kyphosis ~ Start*Number + Age, family= binomial, data=kyphA) #### Neither this nor the other 2nd order interactions seem ### worthwhile ! But quadratic terms in Start and Age DO: > glmA2 <- glm(Kyphosis ~ Start+ I(Start^2) + Number + Age + I(Age^2), family= binomial, data=kyphA) ### AIC = 61.45497, logLik = -25.7275 #### This one seems to be the best model overall !!! > anova(glmA2) Df Deviance Resid. Df Resid. Dev NULL 80 83.234 Start 1 15.162 79 68.072 I(Start^2) 1 6.373 78 61.699 Number 1 1.684 77 60.015 Age 1 3.072 76 56.942 I(Age^2) 1 7.487 75 49.455 ### Residuals Pictures: > plot(kyphA$Age, glmB$resid) > identify(kyphA$Age, glmB$resid) [1] 11 43 46 77 ### obs 11 is very large pos resid, obs 43 has ### moderately large negative one, others moderate positiv. > plot(kyphA$Age, glmA$resid) > identify(kyphA$Age, glmA$resid) [1] 10 11 23 40 43 46 77 ### But 43 is now extreme negative ### residual, with 11 and 23 rather extreme positive ones ! > plot(kyphA$Age, glmA2$resid) ### Now the only very large residuals are observations 77(pos) and 43(neg), with 11 and 25 moderately large positive. ### plotting residuals against Number now shows only 77 and 43 ### very noticeable > cbind.data.frame(obs=kyphA$Kyphosis, Afit=glmA$fit, Bfit=glmB$fit, Age=kyphA$Age, Acat=kyphB$Acat, Start= kyphA$Start, No=kyphA$Number)[c(11,43,46,47, 25, 77),] obs Afit Bfit Age Acat Start No 11 present 0.1214575 0.03578569 82 2 14 5 43 absent 0.9309747 0.81090750 143 4 3 9 46 present 0.2057312 0.16958604 139 4 10 3 47 absent 0.1186181 0.05695073 136 3 15 4 25 present 0.6429469 0.31647929 15 1 2 7 77 present 0.1450918 0.16958604 157 4 13 3 ### All but 11, 25, 43, 77 are very well fitted by glmA2. ### Taking further interaction-terms leads to surrogates ### for dummy-indicators for these individual ### observations: not advisable to do this !!! ======================================================= OK, if this was how well we could do with GLM, how about log-linear models ?? Here the data structure kyphB is the only usable one, and the logLik values have completely different meanings !! ## So as preliminary: plot the predictors against one another ## to check relationships !! > plot(kyphA$Number, kyphA$Start) ### etc. > attach(kyphB) table(Scat, Ncat, Acat) , , Acat = 1 Ncat Scat A B C a 1 2 1 b 2 5 2 c 7 2 0 , , Acat = 2 Ncat Scat A B C a 1 4 1 b 1 3 2 c 4 4 1 , , Acat = 3 Ncat Scat A B C a 2 2 1 b 4 4 1 c 5 2 0 , , Acat = 4 Ncat Scat A B C a 0 0 2 b 2 4 0 c 6 3 0 ### Now begin with loglin model on these ### categories (in this order) plus kyph, ### allowing interactions up to second order ### with kyph and 3-way among the others > lglinB <- loglin(table(Scat, Ncat, Acat, kyph), margin=list(1:3, c(1,4), c(2,4), c(3,4)), param=T, fit=T) 5 iterations: deviation 0.06745102 > lglinB$lrt ### = 22.93809 > loglin(table(Scat, Ncat, Acat, kyph), margin=list(1:3, c(1,4), c(2,4)))$lrt ## 31.247 3 iterations: deviation 0.05945936 > loglin(table(Scat, Ncat, Acat, kyph), margin=list(1:3, c(1,4), c(3,4)))$lrt ## 26.709 4 iterations: deviation 0.02596825 > loglin(table(Scat, Ncat, Acat, kyph), margin=list(1:3, c(2,4), c(3,4)))$lrt ## 35.206 4 iterations: deviation 0.06688744 ### Consider the lrt calculation(in lglinB): > Ctab <- table(Scat, Ncat, Acat, kyph) 2*sum(ifelse(Ctab>0,Ctab*log(Ctab/lglinB$fit),0)) [1] 22.938 ### This says the lrt number is exactly the deviance ### statistic. A reasonable `NULL' model is one ### which is saturated wrt the first 3 responses ### and indep wrt the 4th. > loglin(table(Scat, Ncat, Acat, kyph), margin=list(1:3, c(1,4)))$lrt ## 33.949 2 iterations: deviation 8.881784e-16 > loglin(table(Scat, Ncat, Acat, kyph), + margin=list(1:3, 4))$lrt ## 52.056 2 iterations: deviation 7.105427e-15 ### The loglik for the saturated model is: > sum(ifelse(Ctab>0,Ctab*log(Ctab/81),0))### -277.347 ## So we have the following hierarchy, with variables: ### X=Scat, Z=Ncat, W=Acat, Y=kyph ### Model Deviance logLik ### Saturated 0 -277.347 ### XZW, XY, ZY, WY 22.938 -288.816 ### XZW, XY, WY 26.709 -290.702 ### XZW, XY, ZY 31.247 -292.976 ### XZW, WY, ZY 35.206 -294.956 ### XZW, XY 33.949 -293.627 ### XZW, Y 52.056 -303.881 ## The decrement to deviance due to the Scat variable ## is: 52.056 - 33.949 = 18.107 ### Agrees exactly with: > anova(glm(kyph ~ Scat, family=binomial)) ---------------------------------------------------- SAS Code follows: ---------------------------------------------------- data kyphB (drop= Age Number Start Id); infile "kyphosis"; input id kyph $ Age Number Start; if Age < 96 then if Age < 31 then Acat=1; else Acat=2; else if Age <137 then Acat=3; else Acat=4; if Number < 4 then Ncat="A"; else if Number > 5 then Ncat="C"; else Ncat="B"; if Start < 8 then Scat="a"; else if Start > 13 then Scat="c"; else Scat="b"; if id ne . ; run; proc sort; by kyph Acat Ncat Scat; data kyphC; set kyphB; by kyph Acat Ncat Scat; if first.Scat then count=0; count+1; if last.Scat then output; proc genmod data=kyphB desc; class kyph Acat Ncat Scat ; model kyph = Scat Ncat Acat / dist=bin link=logit type3; /* This is the same model as glmB in Splus above */ proc print; run; Obs kyph Acat Ncat Scat count 1 absent 1 A a 1 2 absent 1 A b 2 3 absent 1 A c 7 4 absent 1 B a 2 5 absent 1 B b 5 6 absent 1 B c 2 7 absent 1 C b 2 8 absent 2 A a 1 9 absent 2 A b 1 10 absent 2 A c 4 11 absent 2 B a 2 12 absent 2 B b 2 13 absent 2 B c 3 14 absent 2 C b 1 15 absent 2 C c 1 16 absent 3 A a 1 17 absent 3 A b 3 18 absent 3 A c 5 19 absent 3 B b 3 20 absent 3 B c 2 21 absent 4 A c 6 22 absent 4 B b 4 23 absent 4 B c 3 24 absent 4 C a 1 25 present 1 C a 1 26 present 2 B a 2 27 present 2 B b 1 28 present 2 B c 1 29 present 2 C a 1 30 present 2 C b 1 31 present 3 A a 1 32 present 3 A b 1 33 present 3 B a 2 34 present 3 B b 1 35 present 3 C a 1 36 present 3 C b 1 37 present 4 A b 2 38 present 4 C a 1 proc catmod data=kyphC; weight count; model kyph*Acat*Ncat*Scat = _response_ / ml nodesign noresponse noiter noprofile; loglin Acat|Ncat|Scat kyph kyph*Acat kyph*Ncat kyph*Scat ; run; /* The output gives ml estimates and SE for each of the cross-classified coefficients and is nearly useless! */