Homework Solutions to Problem 26.
=================================                            5/13/08

> library(MASS)
  rubberlm <- lm(loss ~ ., data=Rubber)
  rubberlm$coef
 (Intercept)      hard      tens
    885.1611  -6.57083 -1.374312

### The simplest rule based on the linear regression
### is to predict each (hard,tens) observation as above-median
### if 885.1611 - 6.57083*hard - 1.374312*tens > 165.
### On the original data this yields

> table(rubberlm$fit > 165, Rubber$loss > 165)
        FALSE TRUE
  FALSE    14    1
  TRUE      1   14

### so only 2 are misclassified.

## The second method is logistic regression:

> Above = as.numeric(Rubber$loss > 165)
  rubberlgs = glm(cbind(Above, 1-Above) ~ hard+tens, data=Rubber,
                  family=binomial)
  rubberlgs$coef
 (Intercept)       hard       tens
    57.24511 -0.5125454 -0.1120316

> table(rubberlgs$fit > 0.5, Rubber$loss > 165)
        FALSE TRUE
  FALSE    14    2
  TRUE      1   13

### now 3 are misclassified.

## Third method: predict based on kernel NPR value > 165

> kerRubber <- outer(rubberlm$fit, rubberlm$fit,
                     function(x,y) dlogis(x,y,40))      ## 30x30 matrix
  kerRubfit <- c(kerRubber %*% Rubber$loss)/c(kerRubber %*% rep(1,30))

> table(kerRubfit > 165, Rubber$loss > 165)
        FALSE TRUE
  FALSE    14    3
  TRUE      1   12

### now 4 are misclassified.

> RubbAlt = cbind.data.frame(Rubber, binvec = as.numeric(Rubber$loss > 165))

> Prdct3 = function(trainfram, testfram) {
     trainlm = lm(loss ~ hard+tens, data=trainfram)
     testlmfit = trainlm$coef[1] + c(data.matrix(testfram[,2:3]) %*%
                                       trainlm$coef[2:3])
     trainlgs = glm(cbind(binvec, 1-binvec) ~ hard+tens, data=trainfram,
                    family=binomial)
     testlgsc = trainlgs$coef[1] + c(data.matrix(testfram[,2:3]) %*%
                                       trainlgs$coef[2:3])
     kertrain = outer(testlmfit, trainlm$fit, function(x,y) dlogis(x,y,40))
     kertest = c(kertrain %*% trainfram$loss)/c(kertrain %*%
                                                  rep(1,nrow(trainfram)))
     list(lm.mis  = sum( (testlmfit > 165) != (testfram$loss > 165)),
          glm.mis = sum( (testlgsc > 0)    != (testfram$loss > 165)),
          ker.mis = sum( (kertest > 165)   != (testfram$loss > 165))) }

> unlist(Prdct3(RubbAlt, RubbAlt))
 lm.mis glm.mis ker.mis
      2       3       4

## Reproduces the previous calculations.

### This completes part (i). NOTE that these estimates of mis-
### classification rates out of 30 will turn out to be too low!

NOTE: it turns out in further testing with this function that -- as
pointed out to me by Sam Younkin -- there are several cases of
cross-validation training samples in which the logistic fit is
"unstable": these are fits in which the (not-quite-converged) models
are trying to fit essentially 0-or-1 predictions but where there are
still some wrongly-predicted cases!

      There were 50 warnings (use warnings() to see them)

> Misclass = array(0, dim=c(1000,3),
       dimnames=list(NULL, c("lm.mis","glm.mis","ker.mis")))
  for(i in 1:1000) {
     lvout = sample(30,5)
     lvin  = setdiff(1:30, lvout)
     Misclass[i,] = unlist(Prdct3(RubbAlt[lvin,], RubbAlt[lvout,])) }

> apply(Misclass,2,sum)/5000
 lm.mis glm.mis ker.mis
 0.1148  0.1142  0.1264

### These are the estimated misclassification rates, suggesting that
### "linear" works best for these data and that roughly one point in 8
### is misclassified. (Actually, we would need a considerably larger
### simulation study to confirm this, although such a study is limited
### by the fact that there are only choose(30,5) = 142506 distinct
### leave-5-out subsets.)

NOTE: UNLIKE THE WAY I TOLD MANY OF YOU TO DO THE PROBLEM, WHEN I WENT
TO DO IT MYSELF I CHOSE TO RE-FIT THE LM AND GLM IN EACH RESAMPLING
ITERATION (BOTH CROSS-VALIDATION AND, LATER, BOOTSTRAP), AND I DID THE
KERNEL METHOD BASED ON THE LINEAR MODEL FIT FROM THAT ITERATION RATHER
THAN FROM THE INITIAL DATA !
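### (Aside, not part of the original output: a minimal sketch of how one
### might attach Monte Carlo standard errors to the three cross-validated
### rates just reported, using the Misclass matrix already in hand; each
### row of Misclass/5 is a per-replicate misclassification rate.)

> apply(Misclass/5, 2, mean)        ## same as apply(Misclass,2,sum)/5000 above
  apply(Misclass/5, 2, sd)/sqrt(nrow(Misclass))   ## Monte Carlo standard errors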
*** Now for bootstraps (part (iii)).

The basis for this is that we have 3 `statistics' (to be spelled out
now) for estimating P(loss > 165 | hard, tens) based on a given
data-set (a data frame with columns loss, hard, tens). The statistics
are:

 (a) from normal linear regression:

        1 - Phi( (165 - b0 - b1*hard - b2*tens)/sigmahat )

 (b) from logistic regression:

        plogis( c0 + c1*hard + c2*tens )

 (c) from the kernel method (by analogy with the NPR approach):

        sum_i I[loss_i > 165] * ker((x - b0 - b1*hard_i - b2*tens_i)/40)
        ----------------------------------------------------------------
                  sum_i ker((x - b0 - b1*hard_i - b2*tens_i)/40)

     where x = b0 + b1*hard + b2*tens is the fitted linear index at the
     (hard,tens) point of interest and ker is the standard logistic
     density (cf. dlogis with scale 40 in the code below).

These are the statistics we bootstrap to get confidence intervals.

> Estim3 = function(evalfram, bootfram) {
     bootlm = lm(loss ~ hard+tens, data=bootfram)
     evallmfit = bootlm$coef[1] + c(data.matrix(evalfram[,2:3]) %*%
                                      bootlm$coef[2:3])
     lmest = 1 - pnorm((165 - evallmfit)/summary(bootlm)$sigma)
     ## binvec = as.numeric(bootfram$loss > 165)
     bootlgs = glm(cbind(binvec, 1-binvec) ~ hard+tens, data=bootfram,
                   family=binomial)
     glmest = plogis(bootlgs$coef[1] + c(data.matrix(evalfram[,2:3]) %*%
                                           bootlgs$coef[2:3]))
     kerboot = outer(evallmfit, bootlm$fit, function(x,y) dlogis(x,y,40))
     kerest = c(kerboot %*% bootfram$binvec)/c(kerboot %*%
                                                 rep(1,nrow(bootfram)))
     list(lm.est = lmest, glm.est = glmest, ker.est = kerest) }

> Estim3(RubbAlt[1:5,], RubbAlt)
$lm.est:
[1] 0.99999998 0.85462905 0.50546810 0.19793889 0.04011242

$glm.est:
[1] 0.999999891 0.950345758 0.497125366 0.078551388 0.006529196

$ker.est:
[1] 0.9593606 0.6186340 0.4408652 0.3167944 0.2251193

### This completes the definition of the statistics. Now we bootstrap,
### which means that the bootfram data will be generated by two
### different with-replacement (nonparametric bootstrap) resampling
### strategies, while evalfram will always be RubbAlt[1:5,] (the first
### 5 points of the Rubber data).

> Boot2mat = Boot1mat = array(0, dim=c(4000,5,3))

> for (i in 1:4000) Boot1mat[i,,] = matrix(unlist(Estim3(RubbAlt[1:5,],
       RubbAlt[ sample(30,30, replace=T), ])), ncol=3)

### This was the simplest bootstrap strategy: pure with-replacement
### sampling of (loss,hard,tens) triples.

### Next consider what is probably the most appealing method for
### bootstrapping a dataset which approximately satisfies a
### linear regression model with additive iid errors. We begin
### by setting up the linear regression & residuals from the
### original data, then bootstrapping (hard,tens) with replacement
### and independently bootstrapping residuals with replacement.

> resids = rubberlm$resid
  yhats  = rubberlm$fit

> for (i in 1:4000) {
     inds1 = sample(30,30,replace=T)
     inds2 = sample(30,30,replace=T)
     Boot2mat[i,,] = matrix(unlist(Estim3(RubbAlt[1:5,],
        cbind.data.frame(loss = yhats[inds1] + resids[inds2],
                         RubbAlt[inds1,2:4]))), ncol=3) }

### In each of these loops, there were warnings (of exactly the same
### type as in the cross-validation steps) from the logistic-regression
### glm function.
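### (Aside, not part of the original run: a minimal sketch, under the same
### resampling scheme as Boot1mat, of how one might gauge how often the
### logistic fit is pushed toward 0-or-1 fitted values in the resampled
### data sets -- the source of the warnings noted above. The replicate
### count 200 and the tolerance 1e-8 are arbitrary illustrative choices.)

> nsep = 0
  for (i in 1:200) {
     bfr = RubbAlt[sample(30,30,replace=T),]
     fit = suppressWarnings(glm(cbind(binvec,1-binvec) ~ hard+tens,
                                data=bfr, family=binomial))
     if (any(fit$fitted > 1-1e-8) || any(fit$fitted < 1e-8)) nsep = nsep+1 }
  nsep/200     ## rough fraction of replicates with near-0/1 fitted values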
> M1 = round( apply(Boot1mat, 2:3, function(x) quantile(x,.025)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 1.0000 0.8583
[2,] 0.6636 0.5107 0.4253
[3,] 0.2621 0.0000 0.2609
[4,] 0.0574 0.0000 0.1600
[5,] 0.0033 0.0000 0.0959

> M2 = round( apply(Boot1mat, 2:3, function(x) quantile(x,.975)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 1.0000 0.9868
[2,] 0.9680 1.0000 0.7747
[3,] 0.6986 1.0000 0.6166
[4,] 0.3266 0.5820 0.4934
[5,] 0.0898 0.0686 0.3942

> M3 = round( apply(Boot2mat, 2:3, function(x) quantile(x,.025)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 0.9999 0.8604
[2,] 0.6324 0.5483 0.3842
[3,] 0.2482 0.0000 0.2410
[4,] 0.0445 0.0000 0.1521
[5,] 0.0022 0.0000 0.0921

> M4 = round( apply(Boot2mat, 2:3, function(x) quantile(x,.975)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 1.0000 0.9882
[2,] 0.9836 1.0000 0.8174
[3,] 0.7670 1.0000 0.6655
[4,] 0.3935 0.6294 0.5291
[5,] 0.1183 0.0698 0.4147

### We still have to make confidence intervals out of these quantiles
### and the estimated quantities above (from the original dataset).

### Of course, instead of bootstrapping the residuals we could have
### taken the parametric approach of simulating them as normally
### distributed with mean 0 and variance summary(rubberlm)$sigma^2,
### but that is very similar and less appealing!
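### (Aside, not part of the original run: for completeness, a minimal sketch
### of what that parametric variant would look like. "Boot3mat" is a
### hypothetical third array, analogous to Boot1mat and Boot2mat; the only
### change from the Boot2mat loop is that residuals are simulated as
### N(0, sigmahat^2) instead of being resampled.)

> sighat = summary(rubberlm)$sigma
  Boot3mat = array(0, dim=c(4000,5,3))
  for (i in 1:4000) {
     inds1 = sample(30,30,replace=T)
     Boot3mat[i,,] = matrix(unlist(Estim3(RubbAlt[1:5,],
        cbind.data.frame(loss = yhats[inds1] + rnorm(30, 0, sighat),
                         RubbAlt[inds1,2:4]))), ncol=3) }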
OK, for constructing CI's we focus on the function of the data which
results in the estimated probability P(loss > 165 | hard, tens).
Unfortunately, there is no nonparametric functional which gives a
consistent estimator, and this is not only a problem of the small
sample size (30) but also a problem of the type of (conditional!)
probability we are estimating. The main issue is that in a
nonparametric context we have only the observations precisely at the
specific (hard,tens) value to help us estimate this conditional
probability.

*** So I told you all that the functionals T that we are estimating
    are those indicated above: the one based on normal linear
    regression, the one based on logistic regression, and the
    nonparametric kernel regression using the linear combination of
    the hard and tens variables estimated from linear regression.

### We complete this analysis by generating an array of confidence
### intervals, as follows: 5 points by 3 estimators by 2 bootstrap
### methods by (lower,upper) CI endpoints.

> Rubber[1:5,2:3]
  hard tens
1   45  162
2   55  233
3   61  232
4   66  231
5   71  231

> CIarr = array(0, dim=c(5,3,2,2), dimnames=list(paste("Pt",1:5,sep=""),
       c("lin.reg","lgst.reg","ker.NPR"), c("Lower","Upper"),
       c("Bootstr.a","Bootstr.c")))
  Ests = matrix(unlist(Estim3(RubbAlt[1:5,],RubbAlt)), ncol=3)
  Ests
           [,1]        [,2]      [,3]
[1,] 0.99999998 0.999999891 0.9593606
[2,] 0.85462905 0.950345758 0.6186340
[3,] 0.50546810 0.497125366 0.4408652
[4,] 0.19793889 0.078551388 0.3167944
[5,] 0.04011242 0.006529196 0.2251193

> CIarr[,,1,1] = 2*Ests - M2    ### These are the basic ("reflected")
  CIarr[,,2,1] = 2*Ests - M1    ### nonparametric bootstrap CI's:
  CIarr[,,1,2] = 2*Ests - M4    ### lower = 2*T - q_.975, upper = 2*T - q_.025,
  CIarr[,,2,2] = 2*Ests - M3    ### with T the original-data estimate.

> round(pmin(pmax(CIarr,0),1),3)
, , Lower, Bootstr.a

    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   0.932
Pt2   0.741    0.901   0.463
Pt3   0.312    0.000   0.265
Pt4   0.069    0.000   0.140
Pt5   0.000    0.000   0.056

, , Upper, Bootstr.a

    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   1.000
Pt2   1.000    1.000   0.812
Pt3   0.749    0.994   0.621
Pt4   0.338    0.157   0.474
Pt5   0.077    0.013   0.354

, , Lower, Bootstr.c

    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   0.931
Pt2   0.726    0.901   0.420
Pt3   0.244    0.000   0.216
Pt4   0.002    0.000   0.104
Pt5   0.000    0.000   0.036

, , Upper, Bootstr.c

    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   1.000
Pt2   1.000    1.000   0.853
Pt3   0.763    0.994   0.641
Pt4   0.351    0.157   0.481
Pt5   0.078    0.013   0.358
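### (Aside, not part of the original output: a minimal sketch of one way to
### read the 4-dimensional CIarr, pairing each original-data estimate with
### its interval -- shown here for the linear-regression statistic under
### the Bootstr.a resampling scheme.)

> CItrim = round(pmin(pmax(CIarr,0),1),3)
  cbind(est   = round(Ests[,1],3),
        lower = CItrim[, "lin.reg", "Lower", "Bootstr.a"],
        upper = CItrim[, "lin.reg", "Upper", "Bootstr.a"])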