Class Log on R Model Fitting With  lm  AND  glm  
=============================================== 10/25/17 and 10/30/17

### Following   /Rlogs/RlogF09.LinRegr.txt , started with   "steamdat"  and basics with  lm.

> steamdat = matrix(c(10.98, 35.3, 11.13, 29.7, 12.51, 30.8, 8.40, 58.8,
 9.27, 61.4, 8.73, 71.3, 6.36, 74.4, 8.50, 76.7, 7.82, 70.7, 9.14, 57.5, 
 8.24, 46.4, 12.19, 28.9, 11.88, 28.1, 9.57, 39.1, 10.94, 46.8, 9.58, 
 48.5, 10.09, 59.3, 8.11, 70.0, 6.83, 70.0, 8.88, 74.5, 7.68, 72.1,
 8.47, 58.1, 8.86, 44.6, 10.36, 33.4, 11.08, 28.6), ncol=2, byrow=T,
       dimnames=list(NULL,c("steamuse","temp")))
### 25   (y,x) pairs : Steam Use is the response

> lmtmp = lm(steamuse ~ . , data=data.frame(steamdat))

### Residual Sum of Squares: 3 ways

> c(sum((steamdat[,1]-lmtmp$fit)^2), sum(lmtmp$resid^2), summary(lmtmp)$sigma^2*23)
 18.2234 18.2234 18.2234
> lmtmp$df.residual
[1] 23

### logLikelihood done directly by first principles, by formula derived in class, and also by R function
 
> c(sum(log(dnorm(steamdat[,1],lmtmp$fit,sqrt(sum(lmtmp$resid^2)/25)))),
     -(25/2)*log(2*pi) - (25/2) -(25/2)*log(sum(lmtmp$resid^2)/25), logLik(lmtmp))
[1] -31.52135 -31.52135 -31.52135

### Next introduced   anova   table idea, log-likelihood calculation for linear models

> anova(lmtmp)

Analysis of Variance Table

Response: steamuse
          Df Sum Sq Mean Sq F value    Pr(>F)    
temp       1 45.592  45.592  57.543 1.055e-07 ***
Residuals 23 18.223   0.792            

> c(24*var(lmtmp$fit), sum(lmtmp$resid^2))
 45.5924 18.2234
 
### Rsq 2 different ways:
> c(summary(lmtmp)$r.sq, var(lmtmp$fit)/var(steamdat[,1]))
[1] 0.7144375 0.7144375


#### Enrich the understanding of the ANOVA table SSQ column in a case with more variables

> library(MASS)
> dim(Boston)
[1] 506  14             ### House prices ("medv") in terms of other variables
> housFit = lm(medv ~ ., data=Boston)
> anova(housFit)
Analysis of Variance Table

Response: medv
           Df  Sum Sq Mean Sq  F value    Pr(>F)    
crim        1  6440.8  6440.8 286.0300 < 2.2e-16 ***
zn          1  3554.3  3554.3 157.8452 < 2.2e-16 ***
indus       1  2551.2  2551.2 113.2984 < 2.2e-16 ***
chas        1  1529.8  1529.8  67.9393 1.543e-15 ***
nox         1    76.2    76.2   3.3861 0.0663505 .  
rm          1 10938.1 10938.1 485.7530 < 2.2e-16 ***
age         1    90.3    90.3   4.0087 0.0458137 *  
dis         1  1779.5  1779.5  79.0262 < 2.2e-16 ***
rad         1    34.1    34.1   1.5159 0.2188325    
tax         1   329.6   329.6  14.6352 0.0001472 ***
ptratio     1  1309.3  1309.3  58.1454 1.266e-13 ***
black       1   593.3   593.3  26.3496 4.109e-07 ***
lstat       1  2410.8  2410.8 107.0634 < 2.2e-16 ***
Residuals 492 11078.8    22.5        

### Check the  cumulative sums of SumSq against the variances from successive
> cumsum( anova(housFit)[,"Sum Sq"] )
 [1]  6440.783  9995.119 12546.356 14076.204 14152.451 25090.568 25180.836
 [8] 26960.337 26994.471 27324.025 28633.335 29226.672 31637.511 42716.295

> 505*c(var(housFit$fit), var(Boston$medv))
[1] 31637.51 42716.30

> 505*c( var(lm(medv ~ crim, data=Boston)$fit), var(lm(medv ~ crim+zn, data=Boston)$fit), 
        var(lm(medv ~ crim+zn+indus, data=Boston)$fit), var(lm(medv ~ crim+zn+indus+chas, data=Boston)$fit),  
		var(lm(medv ~ crim+zn+indus+chas+nox, data=Boston)$fit) )
[1]  6440.783  9995.119 12546.356 14076.204 14152.451

### AIC for final model with usual penalty term (= number of parameters)
> c(-2*(logLik(housFit) - (length(housFit$coef)+1)),  AIC(housFit))
[1] 3027.609 3027.609

### Now look at AIC comparison with model omitting   "lstat"

> tmp = update(housFit, formula= ~.-lstat)
> c(2*(logLik(housFit)-logLik(tmp)), AIC(housFit)-AIC(tmp))
[1]  99.62571 -97.62571

### Same comparison but with penalty term twice as big :

> AIC(housFit, k=4)
[1] 3057.609
> -2*logLik(housFit)
'log Lik.' 2997.609 (df=15)

### Now an automatic stepwise backward or forward-selection fitted model with AIC and k=10

> stepHous1 =  step(housFit, direction="backward", k=4)
Start:  AIC=1617.64
medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad + 
    tax + ptratio + black + lstat

          Df Sum of Sq   RSS    AIC
- age      1      0.06 11079 1613.7
- indus    1      2.52 11081 1613.8
<none>                 11079 1617.6
- chas     1    218.97 11298 1623.5
- tax      1    242.26 11321 1624.6
- crim     1    243.22 11322 1624.6
- zn       1    257.49 11336 1625.3
- black    1    270.63 11349 1625.8
- rad      1    479.15 11558 1635.1
- nox      1    487.16 11566 1635.4
- ptratio  1   1194.23 12273 1665.4
- dis      1   1232.41 12311 1667.0
- rm       1   1871.32 12950 1692.6
- lstat    1   2410.84 13490 1713.3

Step:  AIC=1613.65
medv ~ crim + zn + indus + chas + nox + rm + dis + rad + tax + 
    ptratio + black + lstat

          Df Sum of Sq   RSS    AIC
- indus    1      2.52 11081 1609.8
<none>                 11079 1613.7
- chas     1    219.91 11299 1619.6
- tax      1    242.24 11321 1620.6
- crim     1    243.20 11322 1620.6
- zn       1    260.32 11339 1621.4
- black    1    272.26 11351 1621.9
- rad      1    481.09 11560 1631.2
- nox      1    520.87 11600 1632.9
- ptratio  1   1200.23 12279 1661.7
- dis      1   1352.26 12431 1667.9
- rm       1   1959.55 13038 1692.0
- lstat    1   2718.88 13798 1720.7

Step:  AIC=1609.76
medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + 
    black + lstat

          Df Sum of Sq   RSS    AIC
<none>                 11081 1609.8
- chas     1    227.21 11309 1616.0
- crim     1    245.37 11327 1616.8
- zn       1    257.82 11339 1617.4
- black    1    270.82 11352 1618.0
- tax      1    273.62 11355 1618.1
- rad      1    500.92 11582 1628.1
- nox      1    541.91 11623 1629.9
- ptratio  1   1206.45 12288 1658.0
- dis      1   1448.94 12530 1667.9
- rm       1   1963.66 13045 1688.3
- lstat    1   2723.48 13805 1717.0

> stepHous =  step(lm(medv~1, data=Boston), 
          ~crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat, 
		         direction="forward", k=4)
Start:  AIC=2248.51
medv ~ 1

          Df Sum of Sq   RSS    AIC
+ rm       1   20654.4 22062 1918.2
+ ptratio  1   11014.3 31702 2101.6
+ indus    1    9995.2 32721 2117.6
+ tax      1    9377.3 33339 2127.1
+ nox      1    7800.1 34916 2150.5
+ crim     1    6440.8 36276 2169.8
+ rad      1    6221.1 36495 2172.9
+ age      1    6069.8 36647 2175.0
+ zn       1    5549.7 37167 2182.1
+ black    1    4749.9 37966 2192.9
+ dis      1    2668.2 40048 2219.9
+ chas     1    1312.1 41404 2236.7
<none>                 42716 2248.5

Step:  AIC=1918.19
medv ~ rm

          Df Sum of Sq   RSS    AIC
+ ptratio  1    3320.3 18742 1839.7
+ tax      1    3290.8 18771 1840.5
+ black    1    2594.2 19468 1858.9
+ crim     1    2496.1 19566 1861.4
+ rad      1    2482.5 19579 1861.8
+ indus    1    2254.3 19808 1867.7
+ nox      1    2217.5 19844 1868.6
+ age      1    1997.0 20065 1874.2
+ zn       1     974.5 21087 1899.3
+ chas     1     538.5 21523 1909.7
+ dis      1     512.6 21549 1910.3
<none>                 22062 1918.2

...

Step:  AIC=1609.76
medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + 
    crim + rad + tax

        Df Sum of Sq   RSS    AIC
<none>               11081 1609.8
+ indus  1   2.51754 11079 1613.7
+ age    1   0.06271 11081 1613.8

#### SAME FINAL MODEL !!!

##----------------------------------------------------------------
## Now continue with deviances, anova table, and stepwise fitting using glm

> library(faraway); data(diabetes)
> diabetes[[6]] = as.numeric(diabetes$glyhb > 7.0)
> names(diabetes)[6] = "diabetic";  diabetes = diabetes[,-1]
> sapply(diabetes, function(col) sum(is.na(col)))
    chol stab.glu      hdl    ratio diabetic location      age   gender 
       1        0        1        1       13        0        0        0        
  height   weight    frame    bp.1s    bp.1d    bp.2s    bp.2d    waist
       5        1       12        5        5      262      262        2
     hip time.ppn 
       2        3 

> diabetes = diabetes[,-15:16]
  diabetes = diabetes[apply(diabetes,1,function(row) sum(is.na(row)))==0,]
> dim(diabetes)
[1] 366  16

> maxmod = glm(diabetic ~ ., data=diabetes, family=binomial)
  minmod = glm(diabetic ~ 1, data=diabetes, family=binomial)

> logLik(minmod)
'log Lik.' -156.6067 (df=1)
> sum(log(dbinom(diabetes$diabetic,1,minmod$fit)))
[1] -156.6067
> c(AIC(minmod, k=4), -2*logLik(minmod))+4)
[1] 317.2134 317.2134
> minmod$coef
(Intercept) 
  -1.711221 
> plogis(minmod$coef)
(Intercept) 
  0.1530055 
> summary(minmod$lin)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -1.711  -1.711  -1.711  -1.711  -1.711  -1.711 
> summary(minmod$fit)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.153   0.153   0.153   0.153   0.153   0.153 

> logLik(maxmod)
'log Lik.' -77.78503 (df=17)   ### df = # coef's
> sum(log(dbinom(diabetes$diabetic,1,plogis(maxmod$lin))) )
[1] -77.78503
> c(AIC(minmod, k=4), -2*logLik(minmod)+4*summary(minmod)$df[1])
[1] 317.2134 317.2134

> minmod$formula
diabetic ~ 1
> maxmod$formula
diabetic ~ .

> bestmod = step(glm(diabetic ~ chol + age + waist, family=binomial, data=diabetes),
              scope=list(lower=minmod, upper=maxmod), direction="both", k=4)

Call:  glm(formula = diabetic ~ age + stab.glu, family = binomial, data = diabetes)

Coefficients:
(Intercept)          age     stab.glu  
   -7.80554      0.03673      0.03438  

Degrees of Freedom: 365 Total (i.e. Null);  363 Residual
Null Deviance:      313.2 
Residual Deviance: 170.1        AIC: 176.1

> c(-2*logLik(bestmod), bestmod$dev, bestmod$null.dev, -2*logLik(minmod))
[1] 170.1025 170.1025 313.2134 313.2134

     ### $dev is the deviance between "bestmod" and the "saturated model"
     ### $null.dev is  -2*logLik for the minimal model

> anova(bestmod)
Analysis of Deviance Table

Model: binomial, link: logit
Response: diabetic

Terms added sequentially (first to last)
         Df Deviance Resid. Df Resid. Dev
NULL                       365     313.21
age       1   29.779       364     283.44
stab.glu  1  113.332       363     170.10

> update(bestmod, formula = .~.-stab.glu)$dev
[1] 283.4347

> cbind(round(summary(bestmod)$coef[,1:3],4), summary(bestmod)$coef[,4])
            Estimate Std. Error z value             
(Intercept)  -7.8055     0.9366 -8.3339 7.820552e-17
age           0.0367     0.0133  2.7610 5.763177e-03
stab.glu      0.0344     0.0047  7.3438 2.075846e-13

> maxmod2 = glm(diabetic ~ .^2, family=binomial, data=diabetes)
> summary(maxmod2)$df
[1] 136 230 136

> bestmod2 = step(maxmod, scope=list(lower=minmod$formula, upper=maxmod2$formula), 
             direction="both", k=4)
  bestmod2

Call:  glm(formula = diabetic ~ chol + stab.glu + hdl + age + gender + 
    bp.1d + chol:hdl + stab.glu:gender, family = binomial, data = diabetes)

Coefficients:
          (Intercept)                   chol               stab.glu  
           -22.081230               0.059207               0.027798  
                  hdl                    age           genderfemale  
             0.212328               0.046210              -3.606137  
                bp.1d               chol:hdl  stab.glu:genderfemale  
             0.036178              -0.001129               0.035914  

Degrees of Freedom: 365 Total (i.e. Null);  357 Residual
Null Deviance:      313.2 
Residual Deviance: 143.3        AIC: 161.3

>  c(AIC(bestmod), AIC(bestmod2))
[1] 176.1025 161.3384

> cbind(round(summary(bestmod2)$coef[,1:3],4), summary(bestmod2)$coef[,4])
                      Estimate Std. Error z value             
(Intercept)           -22.0812     4.6421 -4.7567 1.967752e-06
chol                    0.0592     0.0169  3.5136 4.420469e-04
stab.glu                0.0278     0.0065  4.2500 2.137556e-05
hdl                     0.2123     0.0730  2.9079 3.638715e-03
age                     0.0462     0.0152  3.0335 2.417121e-03
genderfemale           -3.6061     1.5764 -2.2876 2.216007e-02
bp.1d                   0.0362     0.0175  2.0680 3.864110e-02
chol:hdl               -0.0011     0.0003 -3.3600 7.795586e-04
stab.glu:genderfemale   0.0359     0.0133  2.7076 6.778141e-03

### "Goodness of fit" is sometimes measured (cf anova-table!) by
> 1-pchisq(bestmod$dev, bestmod$df.resid)         ### 1, same for bestmod2

#### Pictures: residual plot, standardized residual plot for lm; binned plot for glm

> plot(stepHous)
Waiting to confirm page change...
Waiting to confirm page change...
Waiting to confirm page change...
Waiting to confirm page change...

### To calculate standardized residuals:

> mtmp = model.matrix(stepHous)
  sum(abs(summary(stepHous)$cov.unscaled - solve(t(mtmp)%*% mtmp)))
[1] 1.423795e-12       ### shows exactly how to get $cov.unscaled

  hat.matrix = mtmp %*% summary(stepHous)$cov.unsc %*% t(mtmp)
  res.sd = sqrt(diag(diag(nrow(hat.matrix))-hat.matrix))*summary(stepHous)$sigma
  stdized.resid = stepHous$resid/res.sd      ### standardized residuals

> plot(stepHous$fit, stdized.resid)   ### just like one of the standard plots ...

#### Now let's consider a "binned" plot to check the fit of the glm. The idea is
####   to sort the data into categories by quantile (say, deciles) of linear predictor
####   and then compare graphically the relative frequency of 1's within category
####   versus the predicted probability of 1's. We do this on the "diabetes" models.

> dim(diabetes)
[1] 366  16

> rnkbest = rank(bestmod$lin)
> table(rnkbest %/% 37)

 0  1  2  3  4  5  6  7  8  9 
36 37 37 37 37 37 37 37 37 34 

> bintab1 = round(data.matrix(aggregate(cbind(empir=diabetes$diabetic, pred=bestmod$fit), 
                               by=list(rnkbest %/% 37), mean)),4)
      Group.1  empir   pred
 [1,]       0 0.0278 0.0126
 [2,]       1 0.0000 0.0194
 [3,]       2 0.0000 0.0254
 [4,]       3 0.0000 0.0313
 [5,]       4 0.0000 0.0394
 [6,]       5 0.0270 0.0581
 [7,]       6 0.0811 0.0822
 [8,]       7 0.2432 0.1240
 [9,]       8 0.3514 0.3094
[10,]       9 0.8529 0.8838

> bintab2 = round(data.matrix(aggregate(cbind(empir=diabetes$diabetic, pred=bestmod2$fit), 
                           by=list(rank(bestmod2$lin) %/% 37), mean)),4)
      Group.1  empir   pred
 [1,]       0 0.0000 0.0017
 [2,]       1 0.0270 0.0053
 [3,]       2 0.0000 0.0104
 [4,]       3 0.0000 0.0161
 [5,]       4 0.0270 0.0258
 [6,]       5 0.0000 0.0431
 [7,]       6 0.1081 0.0775
 [8,]       7 0.0541 0.1476
 [9,]       8 0.4865 0.3346
[10,]       9 0.8824 0.9266

> plot(1:366, cumsum(diabetes$diabetic[order(bestmod$fit)]), type="l",
      xlab="Ordered Subjects", ylab="Cum Failures", main=paste(
     "Overplotted Empirical and Predicted Cumulative Failures \n",
     "for Subjects Ordered by  bestmod  Score"))
  lines(1:366, cumsum(bestmod$fit[order(bestmod$fit)]), lty=3, col="blue")

  legend(locator(), legend=c("Actual failures", "Predicted failures"), lty=c(1,3),
      col=c("black","blue"))

### saved as  "bestmodPic.pdf"

> plot(1:366, cumsum(diabetes$diabetic[order(bestmod2$fit)]), type="l",
      xlab="Ordered Subjects", ylab="Cum Failures", main=paste(
     "Overplotted Empirical and Predicted Cumulative Failures \n",
     "for Subjects Ordered by  bestmod2  Score"))
  lines(1:366, cumsum(bestmod2$fit[order(bestmod2$fit)]), lty=3, col="red")
  legend(locator(), legend=c("Actual failures", "Predicted failures"), lty=c(1,3),
      col=c("black","red"))

### saved as  "bestmod2Pic.pdf"