HW 13 Solutions
===============

## (A)

> library(faraway)
> debt2 = debt[!is.na(debt$prodebt),-9]
> sapply(debt2, function(col) sum(is.na(col)))
incomegp    house children  singpar    agegp  bankacc  bsocacc   manage 
      16        2        0        0        1       24       54        3 
  cigbuy  xmasbuy locintrn  prodebt 
      16       18       11        0 
> debt3 = cbind.data.frame(debt2, savings = ifelse(is.na(debt2$bankacc), debt2$bsocacc,
        ifelse(is.na(debt2$bsocacc), debt2$bankacc, pmax(debt2$bankacc, debt2$bsocacc))))
> names(debt3)
 [1] "incomegp" "house"    "children" "singpar"  "agegp"    "bankacc" 
 [7] "bsocacc"  "manage"   "cigbuy"   "xmasbuy"  "locintrn" "prodebt" 
[13] "savings" 
> debt.edt = debt3[,-c(6,7)]
> sapply(debt.edt, function(col) sum(is.na(col)))
incomegp    house children  singpar    agegp   manage   cigbuy  xmasbuy 
      16        2        0        0        1        3       16       18 
locintrn  prodebt  savings 
      11        0       20 
> sapply(debt.edt, class)
 incomegp     house  children   singpar     agegp    manage    cigbuy   xmasbuy 
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
 locintrn   prodebt   savings 
"numeric" "numeric" "numeric" 
> table(apply(debt.edt,1, function(row) sum(is.na(row))))
  0   1   2   3   4 
355  46  15   1   2 
> debt.edt = debt.edt[ apply(debt.edt,1, function(row) sum(is.na(row)))==0, ]
> dim(debt.edt)
[1] 355  11
> names(debt.edt)
 [1] "incomegp" "house"    "children" "singpar"  "agegp"    "manage"  
 [7] "cigbuy"   "xmasbuy"  "locintrn" "prodebt"  "savings" 

### Directly do the selection, including terms up to pairwise interactions.

> Dmod0 = lm(prodebt ~ ., data=debt.edt)
> Dmod1 = step(Dmod0, direction="backward", k=4)
> round(summary(Dmod1)$coef,5)
            Estimate Std. Error  t value Pr(>|t|)
(Intercept)  4.14424    0.24680 16.79166  0.00000
incomegp     0.11212    0.02599  4.31369  0.00002
agegp       -0.08095    0.03836 -2.11050  0.03552
manage      -0.13827    0.03988 -3.46709  0.00059
locintrn    -0.11294    0.04000 -2.82339  0.00502
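### Side note on the penalty: step(..., k=4) compares models by the extractAIC()
### criterion  n*log(RSS/n) + k*edf, i.e. a stiffer per-parameter penalty than the
### usual AIC (k=2).  A minimal sketch, assuming Dmod1 has been fit as above,
### recomputes that criterion by hand and checks it against extractAIC():

n   = length(residuals(Dmod1))
edf = n - df.residual(Dmod1)       ## number of estimated mean parameters (model rank)
rss = deviance(Dmod1)              ## residual sum of squares of the lm fit
c(byHand = n*log(rss/n) + 4*edf, extractAIC = extractAIC(Dmod1, k=4)[2])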
> Dmod2 = step(Dmod1, scope=list(lower=formula(Dmod1), upper=prodebt ~ .^2),
        direction="both", k=4)
> round(summary(Dmod2)$coef,5)
                Estimate Std. Error  t value Pr(>|t|)
(Intercept)      3.36340    0.39714  8.46903  0.00000
incomegp         0.38374    0.11175  3.43385  0.00067
agegp           -0.08562    0.03812 -2.24629  0.02531
manage           0.05659    0.08748  0.64698  0.51807
locintrn        -0.11577    0.03972 -2.91439  0.00379
incomegp:manage -0.06585    0.02636 -2.49803  0.01295
> c(AIC(Dmod0), AIC(Dmod1), AIC(Dmod2))
[1] 730.2010 727.7678 723.4765
> c(summary(Dmod0)$r.sq, summary(Dmod1)$r.sq, summary(Dmod2)$r.sq)
[1] 0.1485375 0.1252797 0.1406450
> c(cor(debt.edt$prodebt, Dmod0$fit), cor(debt.edt$prodebt, Dmod1$fit),
    cor(debt.edt$prodebt, Dmod2$fit))^2
[1] 0.1485375 0.1252797 0.1406450
> plot(debt.edt$prodebt, Dmod0$fit, xlab="Outcome", ylab="Fitted",
       main=paste("Overplotted Fitted Values, 3 Models \n",
                  "using debt Data-frame in R Package faraway"))
  lines(lowess(debt.edt$prodebt, Dmod0$fit))
  lines(lowess(debt.edt$prodebt, Dmod1$fit), lty=3, col="red")
  lines(lowess(debt.edt$prodebt, Dmod2$fit), lty=5, col="blue")
### Only small differences between these "lowess" fitted lines.

> plot(Dmod2)
#### These diagnostics look OK, but we can still check for nonlinear dependence on predictors:
> plot(debt.edt$agegp, Dmod2$resid)
> plot(debt.edt$locintrn, Dmod2$resid)
> plot(debt.edt$incomegp, Dmod2$resid)
### These additional plots still show a reasonable fit.

##---------------------------------------------------------------
## (B)

> dim(esoph)
[1] 88  5
> names(esoph)
[1] "agegp"     "alcgp"     "tobgp"     "ncases"    "ncontrols"
> esoph[1:25,]
   agegp     alcgp    tobgp ncases ncontrols
1  25-34 0-39g/day 0-9g/day      0        40
2  25-34 0-39g/day    10-19      0        10
3  25-34 0-39g/day    20-29      0         6
4  25-34 0-39g/day      30+      0         5
5  25-34     40-79 0-9g/day      0        27
6  25-34     40-79    10-19      0         7
7  25-34     40-79    20-29      0         4
8  25-34     40-79      30+      0         7
9  25-34    80-119 0-9g/day      0         2
10 25-34    80-119    10-19      0         1
11 25-34    80-119      30+      0         2
12 25-34      120+ 0-9g/day      0         1
13 25-34      120+    10-19      1         1
14 25-34      120+    20-29      0         1
15 25-34      120+      30+      0         2
16 35-44 0-39g/day 0-9g/day      0        60
17 35-44 0-39g/day    10-19      1        14
18 35-44 0-39g/day    20-29      0         7
19 35-44 0-39g/day      30+      0         8
20 35-44     40-79 0-9g/day      0        35
21 35-44     40-79    10-19      3        23
22 35-44     40-79    20-29      1        14
23 35-44     40-79      30+      0         8
24 35-44    80-119 0-9g/day      0        11
25 35-44    80-119    10-19      0         6
> sapply(esoph, class)
$agegp
[1] "ordered" "factor" 

$alcgp
[1] "ordered" "factor" 

$tobgp
[1] "ordered" "factor" 

$ncases
[1] "numeric"

$ncontrols
[1] "numeric"

> contrasts(esoph$agegp)
             .L         .Q         .C         ^4          ^5
[1,] -0.5976143  0.5455447 -0.3726780  0.1889822 -0.06299408
[2,] -0.3585686 -0.1091089  0.5217492 -0.5669467  0.31497039
[3,] -0.1195229 -0.4364358  0.2981424  0.3779645 -0.62994079
[4,]  0.1195229 -0.4364358 -0.2981424  0.3779645  0.62994079
[5,]  0.3585686 -0.1091089 -0.5217492 -0.5669467 -0.31497039
[6,]  0.5976143  0.5455447  0.3726780  0.1889822  0.06299408
> tmp = factor(sample(1:6,20,replace=T))
> tmp2 = C(tmp, poly)
> contrasts(tmp2)
             .L         .Q         .C         ^4          ^5
[1,] -0.5976143  0.5455447 -0.3726780  0.1889822 -0.06299408
[2,] -0.3585686 -0.1091089  0.5217492 -0.5669467  0.31497039
[3,] -0.1195229 -0.4364358  0.2981424  0.3779645 -0.62994079
[4,]  0.1195229 -0.4364358 -0.2981424  0.3779645  0.62994079
[5,]  0.3585686 -0.1091089 -0.5217492 -0.5669467 -0.31497039
[6,]  0.5976143  0.5455447  0.3726780  0.1889822  0.06299408

#### These "polynomial" contrasts are the default for an ordered factor with 6 levels,
#### and they determine how the model matrix is filled.
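### To see concretely how these contrasts fill the design matrix, a minimal sketch
### (assuming the esoph data as above): the contrast columns are orthonormal, and
### model.matrix() places in each row of the design matrix the row of contr.poly()
### corresponding to that observation's factor level.

cp = contr.poly(6)
round(crossprod(cp), 10)                   ## 5 x 5 identity: orthonormal contrast columns
head(model.matrix(~ agegp, data = esoph))  ## intercept plus the agegp.L, .Q, .C, ^4, ^5 columns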
> EsophMod1 = glm(cbind(ncases, ncontrols) ~ ., family = binomial, data = esoph)
> EsophMod1

Call:  glm(formula = cbind(ncases, ncontrols) ~ ., family = binomial, data = esoph)

Coefficients:
(Intercept)      agegp.L      agegp.Q      agegp.C      agegp^4      agegp^5  
   -1.77997      3.00534     -1.33787      0.15307      0.06410     -0.19363  
    alcgp.L      alcgp.Q      alcgp.C      tobgp.L      tobgp.Q      tobgp.C  
    1.49185     -0.22663      0.25463      0.59448      0.06537      0.15679  

Degrees of Freedom: 87 Total (i.e. Null);  76 Residual
Null Deviance:      227.2 
Residual Deviance: 53.97     AIC: 225.5

> round(summary(EsophMod1)$coef,4)
            Estimate Std. Error  z value Pr(>|z|)
(Intercept)  -1.7800     0.1980  -8.9916   0.0000
agegp.L       3.0053     0.6522   4.6083   0.0000
agegp.Q      -1.3379     0.5911  -2.2633   0.0236
agegp.C       0.1531     0.4485   0.3413   0.7329
agegp^4       0.0641     0.3088   0.2076   0.8356
agegp^5      -0.1936     0.1954  -0.9911   0.3216
alcgp.L       1.4919     0.1994   7.4836   0.0000
alcgp.Q      -0.2266     0.1795  -1.2624   0.2068
alcgp.C       0.2546     0.1591   1.6008   0.1094
tobgp.L       0.5945     0.1942   3.0608   0.0022
tobgp.Q       0.0654     0.1881   0.3475   0.7282
tobgp.C       0.1568     0.1866   0.8404   0.4007

### This suggests that we do not need quite so much detail in the ordered factor levels,
### although all three factors are important.

> anova(EsophMod1)
Analysis of Deviance Table

Model: binomial, link: logit

Response: cbind(ncases, ncontrols)

Terms added sequentially (first to last)

      Df Deviance Resid. Df Resid. Dev
NULL                     87    227.241
agegp  5   88.128        82    139.112
alcgp  3   74.541        79     64.572
tobgp  3   10.599        76     53.973

### All terms are highly significant, except tobacco which is only mildly so.  But the
### coefficients suggest that perhaps only the overall fact of smoking is important.

> EsophMod2 = step(EsophMod1, scope=list(lower=cbind(ncases, ncontrols) ~ 1,
       upper=cbind(ncases, ncontrols) ~ .^2), k=4)
> EsophMod2

Call:  glm(formula = cbind(ncases, ncontrols) ~ agegp + alcgp, family = binomial, data = esoph)

Coefficients:
(Intercept)      agegp.L      agegp.Q      agegp.C      agegp^4      agegp^5  
    -1.9105       2.9179      -1.3333       0.1633       0.1432      -0.1869  
    alcgp.L      alcgp.Q      alcgp.C  
     1.5707      -0.2057       0.2583  

Degrees of Freedom: 87 Total (i.e. Null);  79 Residual
Null Deviance:      227.2 
Residual Deviance: 64.57     AIC: 230.1

### NOTE: slightly WORSE AIC (at k=2) for this 2nd model, and no interactions were added!!
### NOW look at the possibility of lower-order nonlinearities with interactions.

> c(EsophMod1$null.dev-2, logLik(EsophMod1), logLik(EsophMod2))
[1]  225.2406 -100.7270 -106.0263
> AIC(EsophMod1)
[1] 225.454
> logLik(EsophMod1)
'log Lik.' -100.727 (df=12)
> logLik(EsophMod2)
'log Lik.' -106.0263 (df=9)

> esoph2 = esoph
  esoph2$agegp = 20+10*as.numeric(esoph2$agegp)
  esoph2$alcgp = 40*as.numeric(esoph2$alcgp)-20
  esoph2$tobgp = 10*as.numeric(esoph2$tobgp)-5.5
  names(esoph2)[1:3] = c("age","alc","tob")
  esoph2 = cbind.data.frame(esoph2, alcsq = esoph2$alc^2, alc3rd = esoph2$alc^3,
                            tobsq = esoph2$tob^2, tob3rd = esoph2$tob^3)
> EsophMod3A = step(glm(cbind(ncases, ncontrols) ~ ., family = binomial, data = esoph2),
                    direction="backward", k=4)
> EsophMod3A

Call:  glm(formula = cbind(ncases, ncontrols) ~ age + alc + tob, family = binomial, data = esoph2)

Coefficients:
(Intercept)          age          alc          tob  
   -6.15542      0.05287      0.01735      0.02745  

Degrees of Freedom: 87 Total (i.e. Null);  84 Residual
Null Deviance:      227.2 
Residual Deviance: 73.96     AIC: 229.4

> logLik(EsophMod3A)
'log Lik.' -110.7197 (df=4)
#### Not quite as good a log-likelihood, but a comparable AIC with far fewer parameters.
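### For a side-by-side view of the fit vs. parameter-count trade-off (a minimal sketch,
### assuming EsophMod1, EsophMod2 and EsophMod3A are all still in the workspace; they
### code the same predictors differently, so AIC is the simplest common yardstick):

AIC(EsophMod1, EsophMod2, EsophMod3A)
sapply(list(Mod1 = EsophMod1, Mod2 = EsophMod2, Mod3A = EsophMod3A),
       function(m) c(npar = length(coef(m)), resid.dev = deviance(m)))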
> round(summary(EsophMod3A)$coef, 4)
            Estimate Std. Error  z value Pr(>|z|)
(Intercept)  -6.1554     0.5037 -12.2201    0e+00
age           0.0529     0.0072   7.3552    0e+00
alc           0.0173     0.0021   8.3174    0e+00
tob           0.0274     0.0081   3.3993    7e-04

> EsophMod3B = step(EsophMod3A, scope=list(upper=cbind(ncases, ncontrols) ~ (age+alc+tob)^2),
                    direction="forward", k=4)
Start:  AIC=237.44
cbind(ncases, ncontrols) ~ age + alc + tob

          Df Deviance    AIC
<none>         73.959 237.44
+ alc:tob  1   70.852 238.33
+ age:alc  1   71.913 239.39
+ age:tob  1   73.951 241.43

#### So NO interaction terms are added!
### Greater detail in age and in the amounts of alcohol and tobacco might allow
### nonlinear terms and interactions to become more significant.

### A `perfect' solution should also do some model-checking, perhaps graphically:

> plot(1:88, cumsum(esoph2$ncases[order(EsophMod3A$fit)])/
       cumsum((esoph2$ncases+esoph2$ncontrols)[order(EsophMod3A$fit)]), type="l",
       xlab="Ordered Subjects", ylab="Cum Case-Rate", main=paste(
       "Overplotted Empirical and Predicted Cum Case-Rate \n",
       "for Subjects Ordered by EsophMod3A Score"))
  lines(1:88, cumsum(EsophMod3A$fit[order(EsophMod3A$fit)]*(esoph2$ncases+
       esoph2$ncontrols)[order(EsophMod3A$fit)])/
       cumsum((esoph2$ncases+esoph2$ncontrols)[order(EsophMod3A$fit)]), lty=3, col="blue")

#### The plotted blue (predicted) line shows that this model leaves a lot of room for
#### improvement, especially among the subjects with the lowest predicted case-rates!

> plot(1:88, cumsum(esoph2$ncases[order(EsophMod1$fit)])/
       cumsum((esoph2$ncases+esoph2$ncontrols)[order(EsophMod1$fit)]), type="l",
       xlab="Ordered Subjects", ylab="Cum Case-Rate", main=paste(
       "Overplotted Empirical and Predicted Cum Case-Rate \n",
       "for Subjects Ordered by EsophMod1 Score"))
  lines(1:88, cumsum(EsophMod1$fit[order(EsophMod1$fit)]*(esoph2$ncases+
       esoph2$ncontrols)[order(EsophMod1$fit)])/
       cumsum((esoph2$ncases+esoph2$ncontrols)[order(EsophMod1$fit)]), lty=3, col="blue")

### The larger model EsophMod1 is MUCH better with respect to this graphical diagnostic,
### showing that parsimony comes at a price in this case!  Log-likelihood and AIC
### comparisons do not always tell the whole story.
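### The same diagnostic can be packaged for reuse with any candidate model.  The helper
### below is a minimal sketch (the name cumCaseRate and its arguments are introduced here
### only for illustration); it assumes a fitted binomial glm and the ncases/ncontrols
### columns of esoph2 as above:

cumCaseRate = function(fit, dat, ...) {
   ord  = order(fit$fitted.values)                     ## order cells by predicted case-rate
   ntot = dat$ncases + dat$ncontrols
   emp  = cumsum(dat$ncases[ord]) / cumsum(ntot[ord])  ## empirical cumulative case-rate
   pred = cumsum(fit$fitted.values[ord]*ntot[ord]) / cumsum(ntot[ord])  ## predicted counterpart
   plot(seq_along(emp), emp, type="l", xlab="Ordered Subjects", ylab="Cum Case-Rate", ...)
   lines(seq_along(pred), pred, lty=3, col="blue")
   invisible(cbind(empirical = emp, predicted = pred))
}
cumCaseRate(EsophMod3A, esoph2, main="EsophMod3A")   ## reproduces the first plot above
cumCaseRate(EsophMod1,  esoph2, main="EsophMod1")    ## and the second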