HW 18 solutions    (due Monday 12/5, extended to Wednesday 12/7)
----------------------------------------------------------

> library(MASS)
> pima = rbind(Pima.tr, Pima.te)
> dim(pima)
[1] 532   8
> names(pima)
[1] "npreg" "glu"   "bp"    "skin"  "bmi"   "ped"   "age"   "type"

#------------------------------------
(A) Find the best logistic regression model you can for the response variable
"type" in terms of the other variables in this data-frame (and any re-codes,
dummy variables, and interactions you like). I suggest using "step" with a
value k >= 4 (note that BIC would correspond to roughly k=6), using
interactions of order at most 3 (e.g. bmi:skin:bp).

> sapply(pima, class)
    npreg       glu        bp      skin       bmi       ped       age      type
"integer" "integer" "integer" "integer" "numeric" "numeric" "integer"  "factor"

> tmpf = glm(type ~ ., data=pima, family=binomial)
> tmpf2 = step(tmpf, direction="both", k=6,
               scope=list(lower = type~1, upper = .~.^2))
> tmpf2$call
glm(formula = type ~ npreg + glu + bmi + ped + glu:ped, family = binomial,
    data = pima)

> tmpf3 = step(tmpf2, direction="forward", k=6, scope = .~. + npreg^2 + glu^2 + bmi^2)
Start:  AIC=499.27
type ~ npreg + glu + bmi + ped + glu:ped
### (Note: npreg^2 in an R formula expands to just npreg; a true quadratic
### term would have to be written I(npreg^2).)

> tmpf3 = step(tmpf2, direction="forward", k=6, scope = .~(npreg+glu+bmi+ped)^3)
Start:  AIC=499.27
type ~ npreg + glu + bmi + ped + glu:ped

#### So far, the best model seems to be this one.

> anova(tmpf2)
Analysis of Deviance Table

Model: binomial, link: logit

Response: type

Terms added sequentially (first to last)

        Df Deviance Resid. Df Resid. Dev
NULL                      531     676.79
npreg    1   33.100       530     643.69
glu      1  132.340       529     511.35
bmi      1   26.126       528     485.22
ped      1   14.926       527     470.30
glu:ped  1    7.027       526     463.27

> round(summary(tmpf2)$coef, 5)
             Estimate Std. Error  z value Pr(>|z|)
(Intercept) -11.66105    1.24249 -9.38523  0.00000
npreg         0.17737    0.03498  5.07107  0.00000
glu           0.05234    0.00730  7.17383  0.00000
bmi           0.08912    0.01847  4.82545  0.00000
ped           4.91959    1.30991  3.75567  0.00017
glu:ped      -0.02805    0.00960 -2.92156  0.00348

> rMSE = sqrt(mean(((pima$type=="Yes") - tmpf2$fit)^2))
> rMSE
[1] 0.3759087
> classErr = mean((pima$type=="Yes") != (tmpf2$fit>=0.5))
> classErr
[1] 0.2105263
> table(pima$type, tmpf2$fit>=0.5)
     FALSE TRUE
  No   314   41
  Yes   71  106

In part (A), stick with AIC.

#------------------------------------
(B) Assess diabetic classification accuracy on various subgroups, comparing
the observed proportion of women with diabetes in each subgroup with the
average fitted probability of diabetes there:

(i) the 15 subgroups defined by glu in [50,59.99), [60,69.99), ..., [190,199.99)

> hist(pima$glu, nclass=12)
> glugp = (pima$glu-50) %/% 10    ### these groups are intervals of width 10
> table(glugp)
glugp
 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
 2  5 16 52 72 85 59 70 32 36 31 18 20 22 12

> round(aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
                  by=list(glugp=glugp), mean), 5)
   glugp    Diab    pred
1      0 0.00000 0.01353
2      1 0.00000 0.04516
3      2 0.06250 0.06385
4      3 0.07692 0.10375
5      4 0.11111 0.11214
6      5 0.22353 0.18532
7      6 0.27119 0.21545
8      7 0.31429 0.35419
9      8 0.40625 0.36899
10     9 0.47222 0.50920
11    10 0.51613 0.68765
12    11 0.77778 0.72713
13    12 0.90000 0.80160
14    13 0.90909 0.83185
15    14 0.75000 0.84349

### The model with glu does not do a good job of predicting the rate of
### diabetes among groups 3, 6, 7, 11-14, perhaps suggesting the need
### for a nonlinear term in glucose (but that turns out not to help).
### This table shows that the actual [but not the predicted!] rates of diabetes
### are virtually the same for women with predicted rates within the intervals
### (0.1,0.3] and (0.3,0.5], but that the other, narrower groups are well predicted!
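
## (Optional, not part of the assigned solution) The group-level agreement above can
## also be shown as a simple calibration plot of observed vs. mean fitted rates; a
## minimal sketch using the objects pima, glugp, tmpf2 defined above ("calib" is just
## an illustrative name):
> calib = aggregate(cbind(Diab = (pima$type=="Yes"), pred = tmpf2$fitted),
                    by = list(glugp = glugp), mean)
  plot(calib$pred, calib$Diab, xlab = "mean fitted P(Diabetes) in glu group",
       ylab = "observed diabetes rate in glu group")
  abline(0, 1, lty = 2)   ### points far from the 45-degree line show poor group-level fit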
(ii) the subgroups defined by pmin(pima$npreg, 11)

> round(aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
                  by=list(npreg=pmin(pima$npreg,11)), mean), 5)
   npreg    Diab    pred
1      0 0.35065 0.31241
2      1 0.20690 0.21202
3      2 0.17722 0.21381
4      3 0.38596 0.33296
5      4 0.21951 0.30565
6      5 0.35484 0.32214
7      6 0.25000 0.35586
8      7 0.55172 0.55989
9      8 0.65000 0.56648
10     9 0.63158 0.57760
11    10 0.58333 0.48633
12    11 0.65217 0.67917

> table(cut(pima$npreg, breaks=c(-1,1,4,8,20)))
(-1,1]  (1,4]  (4,8] (8,20]
   193    177    108     54

> aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
            by=list(npreg = cut(pima$npreg, breaks=c(-1,1,4,8,20))),
            function(x) round(mean(x),5))
   npreg    Diab    pred
1 (-1,1] 0.26425 0.25207
2  (1,4] 0.25424 0.27345
3  (4,8] 0.43519 0.43997
4 (8,20] 0.62963 0.60058
### This looks OK

> aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
            by=list(npreg = cut(pima$npreg, breaks=c(-1,1,3,6,9,20))),
            function(x) round(mean(x),5))

### Another try can be based on a factor constructed from the predicted diabetes rates
### according to the model tmpf2 falling in respective intervals, i.e.,
### look at subgroups defined by the predictions themselves ...

> summary(tmpf2$fit)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
0.005586 0.085840 0.223500 0.332700 0.542600 0.978400

> round(aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
                  by=list(fitgp = tmpf2$fitted %/% 0.1), mean), 5)
   fitgp    Diab    pred
1      0 0.03356 0.04962
2      1 0.17526 0.14732
3      2 0.19298 0.24338
4      3 0.47727 0.35001
5      4 0.44737 0.45007
6      5 0.46667 0.54651
7      6 0.63636 0.64345
8      7 0.72973 0.74812
9      8 0.86364 0.85426
10     9 0.92857 0.93726

### Finally, display fit according to groups defined by age-intervals

> round(aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
                  by=list(agegp = pmin((pima$age - 10) %/% 10, 5)), mean), 5)
  agegp    Diab    pred
1     1 0.20195 0.23115
2     2 0.43925 0.41500
3     3 0.54795 0.47265
4     4 0.78125 0.66125
5     5 0.23077 0.45913

#------------------ Attempt further model fit

## It is hard to find a better simple model than tmpf2 above with the same variables:
## adding a nonlinear term for glucose or a factor-grouping for glu or npreg does not
## help. However, consider again the age-group table just shown:

> round(aggregate(cbind(Diab=(pima$type=="Yes"), pred=tmpf2$fitted),
                  by=list(agegp = pmin((pima$age - 10) %/% 10, 5)), mean), 5)
  agegp    Diab    pred
1     1 0.20195 0.23115
2     2 0.43925 0.41500
3     3 0.54795 0.47265
4     4 0.78125 0.66125
5     5 0.23077 0.45913

## This suggests that we have to find a way to get the age-group into the model!

> agegp = factor(pmin((pima$age - 10) %/% 10, 5))
> pimaA = cbind.data.frame(pima, agegp=agegp)
> tmpf4 = update(tmpf2, data=pimaA, formula = .~. + agegp)
> anova(tmpf4)
Analysis of Deviance Table

Model: binomial, link: logit

        Df Deviance Resid. Df Resid. Dev
NULL                      531     676.79
npreg    1   33.100       530     643.69
glu      1  132.340       529     511.35
bmi      1   26.126       528     485.22
ped      1   14.926       527     470.30
agegp    4   16.642       523     453.65
glu:ped  1    6.288       522     447.37

### So this helps a bit:
> c(extractAIC(tmpf2), extractAIC(tmpf4))
[1]   6.0000 475.2693  10.0000 467.3666
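
## (Optional check, not in the original solution) Since tmpf2 is nested inside tmpf4
## (tmpf4 just adds the 4-df agegp factor), the two fits can also be compared with a
## likelihood-ratio test; a minimal sketch:
> anova(tmpf2, tmpf4, test="Chisq")
### The tested quantity is the deviance drop 463.27 - 447.37 = 15.90 on 4 df,
### consistent with the extractAIC comparison above favoring tmpf4.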
#------------------------------------
(C) Now assess classification accuracy by cross-validation (both models tmpf2, tmpf4).

## Estimate the probability of misclassification from your best fitted model
## on a new set of data by cross-validation, as follows:
#  1. Repeatedly, a total of N=1000 times, split the data randomly into a test
#     set of size 200 and the complementary training set of size 332.
#  2. For each data splitting, fit models tmpf2, tmpf4 to the training set; use the
#     coefficients to predict from each model on the test set; calculate the numbers
#     of misclassifications defined by classifying pred >= 0.5 as Diabetes.
#  3. Display misclassification rates.

> tmpf2$call
glm(formula = type ~ npreg + glu + bmi + ped + glu:ped, family = binomial,
    data = pima)
> tmpf4$call
glm(formula = type ~ npreg + glu + bmi + ped + agegp + glu:ped, family = binomial,
    data = pimaA)

> set.seed(33)
> sum(abs(tmpf2$fit - plogis(predict(tmpf2, pima))))
[1] 1.180132e-14
### Confirms that "predict" applied to a glm model gives the glm score (linear predictor):
### one must take plogis(.) of that score to get the predicted probability of Diab.

> MisClass = array(0, c(1000,2), dimnames=list(NULL, c("tmpf2","tmpf4")))
> Diab = as.numeric(pima$type=="Yes")
> for(j in 1:1000) {
     train = sample(532, 332)
     test  = setdiff(1:532, train)
     tmpfit2 = update(tmpf2, subset=train)
     tmpfit4 = update(tmpf4, subset=train)
     MisClass[j,] = c(
        mean(Diab[test] != (plogis(predict(tmpfit2, pima[test,]))  >= 0.5)),
        mean(Diab[test] != (plogis(predict(tmpfit4, pimaA[test,])) >= 0.5)) )
  }
> summary(MisClass)
     tmpf2            tmpf4
 Min.   :0.1300   Min.   :0.1300
 1st Qu.:0.2000   1st Qu.:0.1950
 Median :0.2150   Median :0.2100
 Mean   :0.2152   Mean   :0.2116   ### 22% or 21% cross-validated average
 3rd Qu.:0.2300   3rd Qu.:0.2300   ### misclassification rate
 Max.   :0.3100   Max.   :0.2850

## Compare these with the complete-data misclassification rates from each model
## (fitted to the same whole dataset):

> table(tmpf2$fit>=0.5, pima$type)
         No Yes
  FALSE 314  71
  TRUE   41 106
> 112/532          ### (41+71)/532 misclassified
[1] 0.2105263
> table(tmpf4$fit>=0.5, pima$type)
         No Yes
  FALSE 313  64
  TRUE   42 113
> c(tmpf2.misclass = mean((tmpf2$fitted >= 0.5) != (pima$type=="Yes")),
    tmpf4.misclass = mean((tmpf4$fitted >= 0.5) != (pima$type=="Yes")))
tmpf2.misclass tmpf4.misclass
     0.2105263      0.1992481

## We would have expected these whole-data rates to be smaller than the
## cross-validated rates just calculated in part (C), because the latter uses
## different training and test data; testing on the training data inflates
## the percent correct. But the difference in this example is actually small.

## Some further confirmation, this time recording the misclassification rates
## on the training sets themselves:

> MisClass = array(0, c(1000,2), dimnames=list(NULL, c("tmpf2","tmpf4")))
> Diab = as.numeric(pima$type=="Yes")
> for(j in 1:1000) {
     train = sample(532, 332)
     tmpfit2 = update(tmpf2, subset=train)
     tmpfit4 = update(tmpf4, subset=train)
     MisClass[j,] = c(
        mean(Diab[train] != (tmpfit2$fitted >= 0.5)),
        mean(Diab[train] != (tmpfit4$fitted >= 0.5)) )
  }
> apply(MisClass, 2, mean)
    tmpf2     tmpf4
0.2067018 0.1958645
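
## (Optional, not part of the assigned solution) A 10-fold cross-validation sketch for
## tmpf2, as an alternative to the repeated random splits above; the fold labels and
## the objects "folds", "cvErr", "fitk", "predk" are illustrative names only:
> set.seed(44)
> folds = sample(rep(1:10, length.out = nrow(pima)))   ### random fold labels 1..10
> cvErr = numeric(10)
> for(k in 1:10) {
     test  = which(folds == k)
     train = which(folds != k)
     fitk  = update(tmpf2, subset = train)             ### refit on the other 9 folds
     predk = plogis(predict(fitk, pima[test,]))        ### predicted P(Diabetes) on fold k
     cvErr[k] = mean((pima$type[test] == "Yes") != (predk >= 0.5))
  }
> mean(cvErr)   ### 10-fold CV misclassification estimate; should be comparable
                ### to the ~21-22% rates obtained above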