HW 13 Solutions
===============

## (A)

> library(faraway)
> debt2 = debt[!is.na(debt$prodebt),-9]
> sapply(debt2, function(col) sum(is.na(col)))
incomegp    house children  singpar    agegp  bankacc  bsocacc   manage 
      16        2        0        0        1       24       54        3 
  cigbuy  xmasbuy locintrn  prodebt 
      16       18       11        0 
> debt3 = cbind.data.frame(debt2, savings = ifelse(is.na(debt2$bankacc), debt2$bsocacc,
        ifelse(is.na(debt2$bsocacc), debt2$bankacc, pmax(debt2$bankacc, debt2$bsocacc))))
> names(debt3)
 [1] "incomegp" "house"    "children" "singpar"  "agegp"    "bankacc" 
 [7] "bsocacc"  "manage"   "cigbuy"   "xmasbuy"  "locintrn" "prodebt" 
[13] "savings" 
> debt.edt = debt3[,-c(6,7)]
> sapply(debt.edt, function(col) sum(is.na(col)))
incomegp    house children  singpar    agegp   manage   cigbuy  xmasbuy 
      16        2        0        0        1        3       16       18 
locintrn  prodebt  savings 
      11        0       20 
> sapply(debt.edt, class)
 incomegp     house  children   singpar     agegp    manage    cigbuy   xmasbuy 
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
 locintrn   prodebt   savings 
"numeric" "numeric" "numeric" 
> table(apply(debt.edt,1, function(row) sum(is.na(row))))
  0   1   2   3   4 
355  46  15   1   2 
> debt.edt = debt.edt[ apply(debt.edt,1, function(row) sum(is.na(row)))==0, ]
> dim(debt.edt)
[1] 355  11
> names(debt.edt)
 [1] "incomegp" "house"    "children" "singpar"  "agegp"    "manage"  
 [7] "cigbuy"   "xmasbuy"  "locintrn" "prodebt"  "savings" 

### Directly do the selection, including terms up to pairwise interactions.

> Dmod0 = lm(prodebt ~ ., data=debt.edt)
> Dmod1 = step(Dmod0, direction="backward", k=4)
> round(summary(Dmod1)$coef,5)
            Estimate Std. Error  t value Pr(>|t|)
(Intercept)  4.14424    0.24680 16.79166  0.00000
incomegp     0.11212    0.02599  4.31369  0.00002
agegp       -0.08095    0.03836 -2.11050  0.03552
manage      -0.13827    0.03988 -3.46709  0.00059
locintrn    -0.11294    0.04000 -2.82339  0.00502
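### Side note on the penalty: step(..., k=4) compares models by the extractAIC()
### criterion  n*log(RSS/n) + k*edf, i.e. a stiffer per-parameter penalty than the
### usual AIC (k=2).  A minimal sketch, assuming Dmod1 has been fit as above,
### recomputes that criterion by hand and checks it against extractAIC():

n   = length(residuals(Dmod1))
edf = n - df.residual(Dmod1)       ## number of estimated mean parameters (model rank)
rss = deviance(Dmod1)              ## residual sum of squares of the lm fit
c(byHand = n*log(rss/n) + 4*edf, extractAIC = extractAIC(Dmod1, k=4)[2])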
> Dmod2 = step(Dmod1, scope=list(lower=formula(Dmod1), upper=prodebt ~ .^2),
        direction="both", k=4)
> round(summary(Dmod2)$coef,5)
                Estimate Std. Error  t value Pr(>|t|)
(Intercept)      3.36340    0.39714  8.46903  0.00000
incomegp         0.38374    0.11175  3.43385  0.00067
agegp           -0.08562    0.03812 -2.24629  0.02531
manage           0.05659    0.08748  0.64698  0.51807
locintrn        -0.11577    0.03972 -2.91439  0.00379
incomegp:manage -0.06585    0.02636 -2.49803  0.01295
> c(AIC(Dmod0), AIC(Dmod1), AIC(Dmod2))
[1] 730.2010 727.7678 723.4765
> c(summary(Dmod0)$r.sq, summary(Dmod1)$r.sq, summary(Dmod2)$r.sq)
[1] 0.1485375 0.1252797 0.1406450
> c(cor(debt.edt$prodebt, Dmod0$fit), cor(debt.edt$prodebt, Dmod1$fit),
    cor(debt.edt$prodebt, Dmod2$fit))^2
[1] 0.1485375 0.1252797 0.1406450
> plot(debt.edt$prodebt, Dmod0$fit, xlab="Outcome", ylab="Fitted",
       main=paste("Overplotted Fitted Values, 3 Models \n",
                  "using debt Data-frame in R Package faraway"))
  lines(lowess(debt.edt$prodebt, Dmod0$fit))
  lines(lowess(debt.edt$prodebt, Dmod1$fit), lty=3, col="red")
  lines(lowess(debt.edt$prodebt, Dmod2$fit), lty=5, col="blue")
### Only small differences between these "lowess" fitted lines.

> plot(Dmod2)
#### These diagnostics look OK, but we can still check for nonlinear dependence on predictors:
> plot(debt.edt$agegp, Dmod2$resid)
> plot(debt.edt$locintrn, Dmod2$resid)
> plot(debt.edt$incomegp, Dmod2$resid)
### These additional plots still show a reasonable fit.

##---------------------------------------------------------------
## (B)

> dim(esoph)
[1] 88  5
> names(esoph)
[1] "agegp"     "alcgp"     "tobgp"     "ncases"    "ncontrols"
> esoph[1:25,]
   agegp     alcgp    tobgp ncases ncontrols
1  25-34 0-39g/day 0-9g/day      0        40
2  25-34 0-39g/day    10-19      0        10
3  25-34 0-39g/day    20-29      0         6
4  25-34 0-39g/day      30+      0         5
5  25-34     40-79 0-9g/day      0        27
6  25-34     40-79    10-19      0         7
7  25-34     40-79    20-29      0         4
8  25-34     40-79      30+      0         7
9  25-34    80-119 0-9g/day      0         2
10 25-34    80-119    10-19      0         1
11 25-34    80-119      30+      0         2
12 25-34      120+ 0-9g/day      0         1
13 25-34      120+    10-19      1         1
14 25-34      120+    20-29      0         1
15 25-34      120+      30+      0         2
16 35-44 0-39g/day 0-9g/day      0        60
17 35-44 0-39g/day    10-19      1        14
18 35-44 0-39g/day    20-29      0         7
19 35-44 0-39g/day      30+      0         8
20 35-44     40-79 0-9g/day      0        35
21 35-44     40-79    10-19      3        23
22 35-44     40-79    20-29      1        14
23 35-44     40-79      30+      0         8
24 35-44    80-119 0-9g/day      0        11
25 35-44    80-119    10-19      0         6
> sapply(esoph, class)
$agegp
[1] "ordered" "factor" 

$alcgp
[1] "ordered" "factor" 

$tobgp
[1] "ordered" "factor" 

$ncases
[1] "numeric"

$ncontrols
[1] "numeric"

> contrasts(esoph$agegp)
             .L         .Q         .C         ^4          ^5
[1,] -0.5976143  0.5455447 -0.3726780  0.1889822 -0.06299408
[2,] -0.3585686 -0.1091089  0.5217492 -0.5669467  0.31497039
[3,] -0.1195229 -0.4364358  0.2981424  0.3779645 -0.62994079
[4,]  0.1195229 -0.4364358 -0.2981424  0.3779645  0.62994079
[5,]  0.3585686 -0.1091089 -0.5217492 -0.5669467 -0.31497039
[6,]  0.5976143  0.5455447  0.3726780  0.1889822  0.06299408
> tmp = factor(sample(1:6,20,replace=T))
> tmp2 = C(tmp, poly)
> contrasts(tmp2)
             .L         .Q         .C         ^4          ^5
[1,] -0.5976143  0.5455447 -0.3726780  0.1889822 -0.06299408
[2,] -0.3585686 -0.1091089  0.5217492 -0.5669467  0.31497039
[3,] -0.1195229 -0.4364358  0.2981424  0.3779645 -0.62994079
[4,]  0.1195229 -0.4364358 -0.2981424  0.3779645  0.62994079
[5,]  0.3585686 -0.1091089 -0.5217492 -0.5669467 -0.31497039
[6,]  0.5976143  0.5455447  0.3726780  0.1889822  0.06299408

#### These "polynomial" contrasts are the default for an ordered factor with 6 levels,
#### and they determine how the model matrix is filled.
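### To see concretely how these contrasts fill the design matrix, a minimal sketch
### (assuming the esoph data as above): the contrast columns are orthonormal, and
### model.matrix() places in each row of the design matrix the row of contr.poly()
### corresponding to that observation's factor level.

cp = contr.poly(6)
round(crossprod(cp), 10)                   ## 5 x 5 identity: orthonormal contrast columns
head(model.matrix(~ agegp, data = esoph))  ## intercept plus the agegp.L, .Q, .C, ^4, ^5 columns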
> EsophMod1 = glm(cbind(ncases, ncontrols) ~ ., family = binomial, data = esoph)
> EsophMod1

Call:  glm(formula = cbind(ncases, ncontrols) ~ ., family = binomial, data = esoph)

Coefficients:
(Intercept)      agegp.L      agegp.Q      agegp.C      agegp^4      agegp^5  
   -1.77997      3.00534     -1.33787      0.15307      0.06410     -0.19363  
    alcgp.L      alcgp.Q      alcgp.C      tobgp.L      tobgp.Q      tobgp.C  
    1.49185     -0.22663      0.25463      0.59448      0.06537      0.15679  

Degrees of Freedom: 87 Total (i.e. Null);  76 Residual
Null Deviance:      227.2 
Residual Deviance: 53.97     AIC: 225.5

> round(summary(EsophMod1)$coef,4)
            Estimate Std. Error  z value Pr(>|z|)
(Intercept)  -1.7800     0.1980  -8.9916   0.0000
agegp.L       3.0053     0.6522   4.6083   0.0000
agegp.Q      -1.3379     0.5911  -2.2633   0.0236
agegp.C       0.1531     0.4485   0.3413   0.7329
agegp^4       0.0641     0.3088   0.2076   0.8356
agegp^5      -0.1936     0.1954  -0.9911   0.3216
alcgp.L       1.4919     0.1994   7.4836   0.0000
alcgp.Q      -0.2266     0.1795  -1.2624   0.2068
alcgp.C       0.2546     0.1591   1.6008   0.1094
tobgp.L       0.5945     0.1942   3.0608   0.0022
tobgp.Q       0.0654     0.1881   0.3475   0.7282
tobgp.C       0.1568     0.1866   0.8404   0.4007

### This suggests that we do not need quite so much detail in the ordered factor levels,
### although all three factors are important.

> anova(EsophMod1)
Analysis of Deviance Table

Model: binomial, link: logit

Response: cbind(ncases, ncontrols)

Terms added sequentially (first to last)

      Df Deviance Resid. Df Resid. Dev
NULL                     87    227.241
agegp  5   88.128        82    139.112
alcgp  3   74.541        79     64.572
tobgp  3   10.599        76     53.973

### All terms are highly significant, except tobacco which is only mildly so.  But the
### coefficients suggest that perhaps only the overall fact of smoking is important.

> EsophMod2 = step(EsophMod1, scope=list(lower=cbind(ncases, ncontrols) ~ 1,
       upper=cbind(ncases, ncontrols) ~ .^2), k=4)
> EsophMod2

Call:  glm(formula = cbind(ncases, ncontrols) ~ agegp + alcgp, family = binomial, data = esoph)

Coefficients:
(Intercept)      agegp.L      agegp.Q      agegp.C      agegp^4      agegp^5  
    -1.9105       2.9179      -1.3333       0.1633       0.1432      -0.1869  
    alcgp.L      alcgp.Q      alcgp.C  
     1.5707      -0.2057       0.2583  

Degrees of Freedom: 87 Total (i.e. Null);  79 Residual
Null Deviance:      227.2 
Residual Deviance: 64.57     AIC: 230.1

### NOTE: slightly WORSE AIC (at k=2) for this 2nd model, and no interactions were added!!
### NOW look at the possibility of lower-order nonlinearities with interactions.

> c(EsophMod1$null.dev-2, logLik(EsophMod1), logLik(EsophMod2))
[1]  225.2406 -100.7270 -106.0263
> AIC(EsophMod1)
[1] 225.454
> logLik(EsophMod1)
'log Lik.' -100.727 (df=12)
> logLik(EsophMod2)
'log Lik.' -106.0263 (df=9)

> esoph2 = esoph
  esoph2$agegp = 20+10*as.numeric(esoph2$agegp)
  esoph2$alcgp = 40*as.numeric(esoph2$alcgp)-20
  esoph2$tobgp = 10*as.numeric(esoph2$tobgp)-5.5
  names(esoph2)[1:3] = c("age","alc","tob")
  esoph2 = cbind.data.frame(esoph2, alcsq = esoph2$alc^2, alc3rd = esoph2$alc^3,
                            tobsq = esoph2$tob^2, tob3rd = esoph2$tob^3)
> EsophMod3A = step(glm(cbind(ncases, ncontrols) ~ ., family = binomial, data = esoph2),
                    direction="backward", k=4)
> EsophMod3A

Call:  glm(formula = cbind(ncases, ncontrols) ~ age + alc + tob, family = binomial, data = esoph2)

Coefficients:
(Intercept)          age          alc          tob  
   -6.15542      0.05287      0.01735      0.02745  

Degrees of Freedom: 87 Total (i.e. Null);  84 Residual
Null Deviance:      227.2 
Residual Deviance: 73.96     AIC: 229.4

> logLik(EsophMod3A)
'log Lik.' -110.7197 (df=4)
#### Not quite as good a log-likelihood, but a comparable AIC with far fewer parameters.
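### For a side-by-side view of the fit vs. parameter-count trade-off (a minimal sketch,
### assuming EsophMod1, EsophMod2 and EsophMod3A are all still in the workspace; they
### code the same predictors differently, so AIC is the simplest common yardstick):

AIC(EsophMod1, EsophMod2, EsophMod3A)
sapply(list(Mod1 = EsophMod1, Mod2 = EsophMod2, Mod3A = EsophMod3A),
       function(m) c(npar = length(coef(m)), resid.dev = deviance(m)))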
> round(summary(EsophMod3A)$coef, 4)
            Estimate Std. Error  z value Pr(>|z|)
(Intercept)  -6.1554     0.5037 -12.2201    0e+00
age           0.0529     0.0072   7.3552    0e+00
alc           0.0173     0.0021   8.3174    0e+00
tob           0.0274     0.0081   3.3993    7e-04

> EsophMod3B = step(EsophMod3A, scope=list(upper=cbind(ncases, ncontrols) ~ (age+alc+tob)^2),
                    direction="forward", k=4)
Start:  AIC=237.44
cbind(ncases, ncontrols) ~ age + alc + tob

          Df Deviance    AIC
<none>         73.959 237.44
+ alc:tob  1   70.852 238.33
+ age:alc  1   71.913 239.39
+ age:tob  1   73.951 241.43

#### So NO interaction terms are added!
### Greater detail in age and in the amounts of alcohol and tobacco might allow
### nonlinear terms and interactions to become more significant.

### A `perfect' solution should also do some model-checking, perhaps graphically:

> plot(1:88, cumsum(esoph2$ncases[order(EsophMod3A$fit)])/
       cumsum((esoph2$ncases+esoph2$ncontrols)[order(EsophMod3A$fit)]), type="l",
       xlab="Ordered Subjects", ylab="Cum Case-Rate", main=paste(
       "Overplotted Empirical and Predicted Cum Case-Rate \n",
       "for Subjects Ordered by EsophMod3A Score"))
  lines(1:88, cumsum(EsophMod3A$fit[order(EsophMod3A$fit)]*(esoph2$ncases+
       esoph2$ncontrols)[order(EsophMod3A$fit)])/
       cumsum((esoph2$ncases+esoph2$ncontrols)[order(EsophMod3A$fit)]), lty=3, col="blue")

#### The plotted blue (predicted) line shows that this model leaves a lot of room for
#### improvement, especially among the subjects with the lowest predicted case-rates!

> plot(1:88, cumsum(esoph2$ncases[order(EsophMod1$fit)])/
       cumsum((esoph2$ncases+esoph2$ncontrols)[order(EsophMod1$fit)]), type="l",
       xlab="Ordered Subjects", ylab="Cum Case-Rate", main=paste(
       "Overplotted Empirical and Predicted Cum Case-Rate \n",
       "for Subjects Ordered by EsophMod1 Score"))
  lines(1:88, cumsum(EsophMod1$fit[order(EsophMod1$fit)]*(esoph2$ncases+
       esoph2$ncontrols)[order(EsophMod1$fit)])/
       cumsum((esoph2$ncases+esoph2$ncontrols)[order(EsophMod1$fit)]), lty=3, col="blue")

### The larger model EsophMod1 is MUCH better with respect to this graphical diagnostic,
### showing that parsimony comes at a price in this case!  Log-likelihood and AIC
### comparisons do not always tell the whole story.
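### The same diagnostic can be packaged for reuse with any candidate model.  The helper
### below is a minimal sketch (the name cumCaseRate and its arguments are introduced here
### only for illustration); it assumes a fitted binomial glm and the ncases/ncontrols
### columns of esoph2 as above:

cumCaseRate = function(fit, dat, ...) {
   ord  = order(fit$fitted.values)                     ## order cells by predicted case-rate
   ntot = dat$ncases + dat$ncontrols
   emp  = cumsum(dat$ncases[ord]) / cumsum(ntot[ord])  ## empirical cumulative case-rate
   pred = cumsum(fit$fitted.values[ord]*ntot[ord]) / cumsum(ntot[ord])  ## predicted counterpart
   plot(seq_along(emp), emp, type="l", xlab="Ordered Subjects", ylab="Cum Case-Rate", ...)
   lines(seq_along(pred), pred, lty=3, col="blue")
   invisible(cbind(empirical = emp, predicted = pred))
}
cumCaseRate(EsophMod3A, esoph2, main="EsophMod3A")   ## reproduces the first plot above
cumCaseRate(EsophMod1,  esoph2, main="EsophMod1")    ## and the second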