HW 18 solutions    (due Monday 12/5, extended to Wednesday 12/7)
----------------------------------------------------------

> library(MASS)
> pima = rbind(Pima.tr, Pima.te)
> dim(pima)
[1] 532   8
> names(pima)
[1] "npreg" "glu"   "bp"    "skin"  "bmi"   "ped"   "age"   "type"

#------------------------------------
(A) Find the best logistic regression model you can for the response variable
"type" in terms of the other variables in this data-frame (and any re-codes,
dummy variables, and interactions you like). I suggest using "step" with a
value k >= 4 (note that BIC would correspond to roughly k=6), using
interactions of order at most 3 (e.g. bmi:skin:bp).

> sapply(pima, class)
    npreg       glu        bp      skin       bmi       ped       age      type
"integer" "integer" "integer" "integer" "numeric" "numeric" "integer"  "factor"

> tmpf = glm(type ~ ., data=pima, family=binomial)
> tmpf2 = step(tmpf, direction="both", k=6,
               scope=list(lower = type~1, upper = .~.^2))
> tmpf2$call
glm(formula = type ~ npreg + glu + bmi + ped + glu:ped, family = binomial,
    data = pima)

> tmpf3 = step(tmpf2, direction="forward", k=6, scope = .~. + npreg^2 + glu^2 + bmi^2)
Start:  AIC=499.27
type ~ npreg + glu + bmi + ped + glu:ped
### (Note: npreg^2 in an R formula expands to just npreg; a true quadratic
### term would have to be written I(npreg^2).)

> tmpf3 = step(tmpf2, direction="forward", k=6, scope = .~(npreg+glu+bmi+ped)^3)
Start:  AIC=499.27
type ~ npreg + glu + bmi + ped + glu:ped

#### So far, the best model seems to be this one.

> anova(tmpf2)
Analysis of Deviance Table

Model: binomial, link: logit

Response: type

Terms added sequentially (first to last)

        Df Deviance Resid. Df Resid. Dev
NULL                      531     676.79
npreg    1   33.100       530     643.69
glu      1  132.340       529     511.35
bmi      1   26.126       528     485.22
ped      1   14.926       527     470.30
glu:ped  1    7.027       526     463.27

> round(summary(tmpf2)$coef, 5)
             Estimate Std. Error  z value Pr(>|z|)
(Intercept) -11.66105    1.24249 -9.38523  0.00000
npreg         0.17737    0.03498  5.07107  0.00000
glu           0.05234    0.00730  7.17383  0.00000
bmi           0.08912    0.01847  4.82545  0.00000
ped           4.91959    1.30991  3.75567  0.00017
glu:ped      -0.02805    0.00960 -2.92156  0.00348

> rMSE = sqrt(mean(((pima$type=="Yes") - tmpf2$fit)^2))
> rMSE
[1] 0.3759087
> classErr = mean((pima$type=="Yes") != (tmpf2$fit>=0.5))
> classErr
[1] 0.2105263
> table(pima$type, tmpf2$fit>=0.5)
     FALSE TRUE
  No   314   41
  Yes   71  106

In part (A), stick with AIC.

#------------------------------------
(B) Assess diabetic classification accuracy on various subgroups, comparing
the observed proportion of women with diabetes in each subgroup with the
average fitted probability of diabetes there:

(i) the 15 subgroups defined by glu in [50,59.99), [60,69.99), ..., [190,199.99)

> hist(pima$glu, nclass=12)
> glugp = (pima$glu-50) %/% 10    ### these groups are intervals of width 10
> table(glugp)
glugp
 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
 2  5 16 52 72 85 59 70 32 36 31 18 20 22 12

> round(aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
                  by=list(glugp=glugp), mean), 5)
   glugp    Diab    pred
1      0 0.00000 0.01353
2      1 0.00000 0.04516
3      2 0.06250 0.06385
4      3 0.07692 0.10375
5      4 0.11111 0.11214
6      5 0.22353 0.18532
7      6 0.27119 0.21545
8      7 0.31429 0.35419
9      8 0.40625 0.36899
10     9 0.47222 0.50920
11    10 0.51613 0.68765
12    11 0.77778 0.72713
13    12 0.90000 0.80160
14    13 0.90909 0.83185
15    14 0.75000 0.84349

### The model with glu does not do a good job of predicting the rate of
### diabetes among groups 3, 6, 7, 11-14, perhaps suggesting the need
### for a nonlinear term in glucose (but that turns out not to help).
### This table shows that the actual [but not the predicted!] rates of diabetes
### are virtually the same for women with predicted rates within the intervals
### (0.1,0.3] and (0.3,0.5], but that the other, narrower groups are well predicted!
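
## (Optional, not part of the assigned solution) The group-level agreement above can
## also be shown as a simple calibration plot of observed vs. mean fitted rates; a
## minimal sketch using the objects pima, glugp, tmpf2 defined above ("calib" is just
## an illustrative name):
> calib = aggregate(cbind(Diab = (pima$type=="Yes"), pred = tmpf2$fitted),
                    by = list(glugp = glugp), mean)
  plot(calib$pred, calib$Diab, xlab = "mean fitted P(Diabetes) in glu group",
       ylab = "observed diabetes rate in glu group")
  abline(0, 1, lty = 2)   ### points far from the 45-degree line show poor group-level fit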
(ii) the subgroups defined by pmin(pima$npreg, 11)

> round(aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
                  by=list(npreg=pmin(pima$npreg,11)), mean), 5)
   npreg    Diab    pred
1      0 0.35065 0.31241
2      1 0.20690 0.21202
3      2 0.17722 0.21381
4      3 0.38596 0.33296
5      4 0.21951 0.30565
6      5 0.35484 0.32214
7      6 0.25000 0.35586
8      7 0.55172 0.55989
9      8 0.65000 0.56648
10     9 0.63158 0.57760
11    10 0.58333 0.48633
12    11 0.65217 0.67917

> table(cut(pima$npreg, breaks=c(-1,1,4,8,20)))
(-1,1]  (1,4]  (4,8] (8,20]
   193    177    108     54

> aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
            by=list(npreg = cut(pima$npreg, breaks=c(-1,1,4,8,20))),
            function(x) round(mean(x),5))
   npreg    Diab    pred
1 (-1,1] 0.26425 0.25207
2  (1,4] 0.25424 0.27345
3  (4,8] 0.43519 0.43997
4 (8,20] 0.62963 0.60058
### This looks OK

> aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
            by=list(npreg = cut(pima$npreg, breaks=c(-1,1,3,6,9,20))),
            function(x) round(mean(x),5))

### Another try can be based on a factor constructed from the predicted diabetes rates
### according to the model tmpf2 falling in respective intervals, i.e.,
### look at subgroups defined by the predictions themselves ...

> summary(tmpf2$fit)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
0.005586 0.085840 0.223500 0.332700 0.542600 0.978400

> round(aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
                  by=list(fitgp = tmpf2$fitted %/% 0.1), mean), 5)
   fitgp    Diab    pred
1      0 0.03356 0.04962
2      1 0.17526 0.14732
3      2 0.19298 0.24338
4      3 0.47727 0.35001
5      4 0.44737 0.45007
6      5 0.46667 0.54651
7      6 0.63636 0.64345
8      7 0.72973 0.74812
9      8 0.86364 0.85426
10     9 0.92857 0.93726

### Finally, display fit according to groups defined by age-intervals

> round(aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
                  by=list(agegp = pmin((pima$age - 10) %/% 10, 5)), mean), 5)
  agegp    Diab    pred
1     1 0.20195 0.23115
2     2 0.43925 0.41500
3     3 0.54795 0.47265
4     4 0.78125 0.66125
5     5 0.23077 0.45913

#------------------ Attempt further model fit

## It is hard to find a better simple model than tmpf2 above with the same variables:
## adding a nonlinear term for glucose or a factor-grouping for glu or npreg does not
## help. However, consider again the age-group table just shown:

> round(aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
                  by=list(agegp = pmin((pima$age - 10) %/% 10, 5)), mean), 5)
  agegp    Diab    pred
1     1 0.20195 0.23115
2     2 0.43925 0.41500
3     3 0.54795 0.47265
4     4 0.78125 0.66125
5     5 0.23077 0.45913

## This suggests that we have to find a way to get the age-group into the model!

> agegp = factor(pmin((pima$age - 10) %/% 10, 5))
> pimaA = cbind.data.frame(pima, agegp=agegp)
> tmpf4 = update(tmpf2, data=pimaA, formula = .~. + agegp)
> anova(tmpf4)
Analysis of Deviance Table

Model: binomial, link: logit

        Df Deviance Resid. Df Resid. Dev
NULL                      531     676.79
npreg    1   33.100       530     643.69
glu      1  132.340       529     511.35
bmi      1   26.126       528     485.22
ped      1   14.926       527     470.30
agegp    4   16.642       523     453.65
glu:ped  1    6.288       522     447.37

### So this helps a bit:
> c(extractAIC(tmpf2), extractAIC(tmpf4))
[1]   6.0000 475.2693  10.0000 467.3666
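
## (Optional check, not in the original solution) Since tmpf2 is nested inside tmpf4
## (tmpf4 just adds the 4-df agegp factor), the two fits can also be compared with a
## likelihood-ratio test; a minimal sketch:
> anova(tmpf2, tmpf4, test="Chisq")
### The tested quantity is the deviance drop 463.27 - 447.37 = 15.90 on 4 df,
### consistent with the extractAIC comparison above favoring tmpf4.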
#------------------------------------
(C) Now assess classification accuracy by cross-validation (both models tmpf2, tmpf4).

## Estimate the probability of misclassification from your best fitted model
## on a new set of data by cross-validation, as follows:
#  1. Repeatedly, a total of N=1000 times, split the data randomly into a test
#     set of size 200 and the complementary training set of size 332.
#  2. For each data splitting, fit models tmpf2, tmpf4 to the training set; use the
#     coefficients to predict from each model on the test set; calculate the numbers
#     of misclassifications defined by classifying pred >= 0.5 as Diabetes.
#  3. Display misclassification rates.

> tmpf2$call
glm(formula = type ~ npreg + glu + bmi + ped + glu:ped, family = binomial,
    data = pima)
> tmpf4$call
glm(formula = type ~ npreg + glu + bmi + ped + agegp + glu:ped, family = binomial,
    data = pimaA)

> set.seed(33)
> sum(abs(tmpf2$fit - plogis(predict(tmpf2, pima))))
[1] 1.180132e-14
### Confirms that "predict" applied to a glm model gives the glm score (linear predictor):
### one must take plogis(.) of that score to get the predicted probability of Diab.

> MisClass = array(0, c(1000,2), dimnames=list(NULL, c("tmpf2","tmpf4")))
> Diab = as.numeric(pima$type=="Yes")
> for(j in 1:1000) {
     train = sample(532, 332)
     test  = setdiff(1:532, train)
     tmpfit2 = update(tmpf2, subset=train)
     tmpfit4 = update(tmpf4, subset=train)
     MisClass[j,] = c(
        mean(Diab[test] != (plogis(predict(tmpfit2, pima[test,]))  >= 0.5)),
        mean(Diab[test] != (plogis(predict(tmpfit4, pimaA[test,])) >= 0.5)) )
  }
> summary(MisClass)
     tmpf2            tmpf4
 Min.   :0.1300   Min.   :0.1300
 1st Qu.:0.2000   1st Qu.:0.1950
 Median :0.2150   Median :0.2100
 Mean   :0.2152   Mean   :0.2116   ### 22% or 21% cross-validated average
 3rd Qu.:0.2300   3rd Qu.:0.2300   ### misclassification rate
 Max.   :0.3100   Max.   :0.2850

## Compare these with the complete-data misclassification rates from each model
## (fitted to the same whole dataset):

> table(tmpf2$fit>=0.5, pima$type)
         No Yes
  FALSE 314  71
  TRUE   41 106
> 112/532          ### (41+71)/532 misclassified
[1] 0.2105263
> table(tmpf4$fit>=0.5, pima$type)
         No Yes
  FALSE 313  64
  TRUE   42 113
> c(tmpf2.misclass = mean((tmpf2$fitted >= 0.5) != (pima$type=="Yes")),
    tmpf4.misclass = mean((tmpf4$fitted >= 0.5) != (pima$type=="Yes")))
tmpf2.misclass tmpf4.misclass
     0.2105263      0.1992481

## We would have expected these whole-data rates to be smaller than the
## cross-validated rates just calculated in part (C), because the latter uses
## different training and test data; testing on the training data inflates
## the percent correct. But the difference in this example is actually small.

## Some further confirmation, this time recording the misclassification rates
## on the training sets themselves:

> MisClass = array(0, c(1000,2), dimnames=list(NULL, c("tmpf2","tmpf4")))
> Diab = as.numeric(pima$type=="Yes")
> for(j in 1:1000) {
     train = sample(532, 332)
     tmpfit2 = update(tmpf2, subset=train)
     tmpfit4 = update(tmpf4, subset=train)
     MisClass[j,] = c(
        mean(Diab[train] != (tmpfit2$fitted >= 0.5)),
        mean(Diab[train] != (tmpfit4$fitted >= 0.5)) )
  }
> apply(MisClass, 2, mean)
    tmpf2     tmpf4
0.2067018 0.1958645
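
## (Optional, not part of the assigned solution) A 10-fold cross-validation sketch for
## tmpf2, as an alternative to the repeated random splits above; the fold labels and
## the objects "folds", "cvErr", "fitk", "predk" are illustrative names only:
> set.seed(44)
> folds = sample(rep(1:10, length.out = nrow(pima)))   ### random fold labels 1..10
> cvErr = numeric(10)
> for(k in 1:10) {
     test  = which(folds == k)
     train = which(folds != k)
     fitk  = update(tmpf2, subset = train)             ### refit on the other 9 folds
     predk = plogis(predict(fitk, pima[test,]))        ### predicted P(Diabetes) on fold k
     cvErr[k] = mean((pima$type[test] == "Yes") != (predk >= 0.5))
  }
> mean(cvErr)   ### 10-fold CV misclassification estimate; should be comparable
                ### to the ~21-22% rates obtained above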