HW Set 17, due Monday November 30, 2016; extended to Fri. Dec. 2
-------------------------------------------------
(A) From HW14, we take

> nvec
 [1] 21 30 28 41 52 55 47 52 80 25

and, as our first choice of (a0,b0) pair for the simulation,

ab0 = c(1.186108, 3.127288)

Now we do a first simulation with N=5000:

set.seed(5753)
pmat = array(rbeta(5e4, ab0[1], ab0[2]), c(10,5000))
Xmat = array(rbinom(5e4, rep(nvec,5000), c(pmat)), c(10,5000))

### The frequentist parametric-bootstrap method suggested in the problem
### is to generate only new pi*'s, not new (a*,b*)'s, as follows, with
### array index b=1:1000 for the bootstrap replicate, index r=1:5000 for
### the simulation replication, and i=1:10 for the cluster associated
### with nvec[i].

Cover = Length = array(0, c(5000,10,2),
           dimnames=list(NULL, 1:10, c("PBoot","Bayes")))
Xboot.mat = array(rbinom(5.e7, rep(rep(nvec,5000), each=1000),
           rep(c(Xmat)/rep(nvec,5000), each=1000)), c(1000,10,5000))

## Note that "coverage" always refers to pmat entries falling within the
## confidence/prediction intervals.

## Here is the function I asked you to code, generating bootstrap CI's
## along with the corresponding Bayesian credible intervals.
> BootBayesCI = function(Xbmat, Xvec, nsiz, ab.work, alph=.05) {
     ## Xbmat is B x nclus; Xvec and nsiz are numeric(nclus)
     q1 = alph/2
     q2 = 1-q1
     CI.Bayes = cbind(qbeta(q1, ab.work[1]+Xvec, ab.work[2]+nsiz-Xvec),
                      qbeta(q2, ab.work[1]+Xvec, ab.work[2]+nsiz-Xvec))  ## nclus x 2
     CI.PBoot = 2*Xvec/nsiz - (1/nsiz)*t(apply(Xbmat, 2,
                      function(col) quantile(col, prob=c(q2,q1))))
     CI.PBoot[,1] = pmax(0, CI.PBoot[,1])
     CI.PBoot[,2] = pmin(1, CI.PBoot[,2])
     list(CI.Bayes=CI.Bayes, CI.PBoot=CI.PBoot)
  }

## As an example of use of the function:

> aux = BootBayesCI(Xboot.mat[,,5], Xmat[,5], nvec, ab0)
  cbind(aux$CI.Bayes, aux$CI.PBoot)
                                 97.5%      2.5%
 [1,] 0.15980240 0.5133872 0.0952381 0.6190476
 [2,] 0.05392296 0.2865613 0.0000000 0.2000000
 [3,] 0.05751684 0.3029678 0.0000000 0.2142857
 [4,] 0.05400623 0.2492064 0.0000000 0.1707317
 [5,] 0.22392047 0.4682031 0.1730769 0.6153846
 [6,] 0.36661410 0.6180251 0.4909091 0.9277273
 [7,] 0.10238350 0.3168329 0.0000000 0.2978723
 [8,] 0.23961834 0.4868272 0.1923077 0.6538462
 [9,] 0.25977894 0.4626478 0.1875000 0.6378125
[10,] 0.08627991 0.3729608 0.0000000 0.3600000

#### Now comes the application of the function to the "simulation study"
#### requested in the problem assignment, first with the working (a,b)
#### equal to the values ab0 used to generate the data:

for (r in 1:5000) {
    tmp = BootBayesCI(Xboot.mat[,,r], Xmat[,r], nvec, ab0)
    Cover[r,,2]  = (pmat[,r] > tmp$CI.Bayes[,1]) * (pmat[,r] <= tmp$CI.Bayes[,2])
    Length[r,,2] = tmp$CI.Bayes[,2] - tmp$CI.Bayes[,1]
    Cover[r,,1]  = (pmat[,r] > tmp$CI.PBoot[,1]) * (pmat[,r] <= tmp$CI.PBoot[,2])
    Length[r,,1] = tmp$CI.PBoot[,2] - tmp$CI.PBoot[,1]
 }

> round(apply(Cover,2:3,mean), 4)
    PBoot  Bayes
1  0.8122 0.9506
2  0.8422 0.9468
3  0.8444 0.9544
4  0.8572 0.9468
5  0.8794 0.9492
6  0.8838 0.9494
7  0.8764 0.9440
8  0.8770 0.9532
9  0.9078 0.9542
10 0.8264 0.9450

> round(apply(Length,2:3,mean), 4)
    PBoot  Bayes
1  0.2902 0.3019
2  0.2514 0.2579
3  0.2623 0.2673
4  0.2214 0.2246
5  0.1994 0.2013
6  0.1969 0.1981
7  0.2091 0.2111
8  0.1997 0.2014
9  0.1661 0.1665
10 0.2751 0.2797

### So the lengths of the CI's of the two methods are very similar, but the
### Bayes CI's have essentially nominal 95% coverage, while the bootstrap
### coverage is only 80-90%!

### We re-run the study with an incorrect working (a,b) of (1,2):

for (r in 1:5000) {
    tmp = BootBayesCI(Xboot.mat[,,r], Xmat[,r], nvec, c(1,2))
    Cover[r,,2]  = (pmat[,r] > tmp$CI.Bayes[,1]) * (pmat[,r] <= tmp$CI.Bayes[,2])
    Length[r,,2] = tmp$CI.Bayes[,2] - tmp$CI.Bayes[,1]
    Cover[r,,1]  = (pmat[,r] > tmp$CI.PBoot[,1]) * (pmat[,r] <= tmp$CI.PBoot[,2])
    Length[r,,1] = tmp$CI.PBoot[,2] - tmp$CI.PBoot[,1]
 }

> round( cbind(Bayes.Cover  = apply(Cover[,,2],2,mean),
               Bayes.Length = apply(Length[,,2],2,mean)), 4)
   Bayes.Cover Bayes.Length
1       0.9552       0.3086
2       0.9538       0.2619
3       0.9576       0.2719
4       0.9484       0.2272
5       0.9512       0.2032
6       0.9512       0.1998
7       0.9472       0.2132
8       0.9540       0.2033
9       0.9540       0.1675
10      0.9506       0.2850

### The alternative working (a,b)=(1,2) I chose led to slight over-coverage.
### Other choices (with a > a0 and b > b0) would have led to under-coverage.

##--------------------------------------------------------------
## NOTE: in my runs of this method, I found consistent under-coverage by the
## parametric-bootstrap method. That is partly because the bootstrap, as
## specified, does nothing to account for the extra variability due to the
## Beta-distributed "random effect" at the first stage. That is my fault for
## specifying in the HW assignment that the bootstrap be done this way. If
## you found a better way to do the bootstrap that legitimately gave
## closer-to-nominal coverage for the bootstrap method, you get full credit
## for this part of the HW problem plus an extra-credit point. The fully
## "correct" way, as I tried to explain in class last week, would be to do
## the parametric bootstrap based on (a.hat, b.hat).
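For reference, the parametric-bootstrap intervals in BootBayesCI are of the "basic" (reflected-quantile) form 2*theta.hat - quantile(theta*, c(1-alph/2, alph/2)). Here is a minimal sketch of that construction for a single binomial proportion; all numbers below are toy values chosen for illustration, not quantities from the HW data.

```r
## Basic (reflected-quantile) parametric-bootstrap CI for one binomial
## proportion p.hat = X/n.  Toy values only, for illustration.
set.seed(2)
n <- 40; X <- 13
phat  <- X/n
pstar <- rbinom(1000, n, phat)/n               # bootstrap replicates of p.hat
CI <- 2*phat - quantile(pstar, c(.975, .025))  # reflect: gives (lower, upper)
CI <- c(max(0, CI[1]), min(1, CI[2]))          # clip to [0,1], as in BootBayesCI
round(CI, 3)
```

As in the HW study, this interval reflects only the binomial sampling variability given p.hat; it does nothing about the Beta-distributed random effect, which is one source of the under-coverage noted above.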
Next, with the same pmat and Xmat data, let us run a bootstrap simulation done
the way it should be done according to bootstrap theory: first calculate a
5000 x 2 array abhat of MLEs, then sample in two stages from those as "true"
parameters. The MLE steps are as in the solution to HW 14:

NlogL = function(x,n,a,b)  sum(lbeta(a,b) - lbeta(x+a, n-x+b))

abhat = array(0, c(5000,2))
condvec = numeric(5000)
for (r in 1:5000) {
    tmpfit = nlm(function(th) NlogL(Xmat[,r], nvec, exp(th[1]), exp(th[2])),
                 log(ab0), stepmax=10, steptol=1.e-8, gradtol=1.e-8)
    condvec[r] = tmpfit$code
    abhat[r,] = exp(tmpfit$est)
 }

> table(condvec)
condvec
   1    2    3
4722  267   11

### OK, convergent even though ...

> summary(log(abhat))
       V1                  V2
 Min.   :-1.42192   Min.   :-0.4806
 1st Qu.:-0.01379   1st Qu.: 0.9482
 Median : 0.32863   Median : 1.3317
 Mean   : 0.42425   Mean   : 1.4203
 3rd Qu.: 0.74468   3rd Qu.: 1.7689
 Max.   :25.18433   Max.   :25.8429

### ... some of the individual MLEs are very large positive.

> hist(abhat[,1]/(abhat[,1]+abhat[,2]), nclass=40)
## indicates that the ratio a/(a+b) has a very regular-looking distribution

### Now proceed to generate the simulated array in two steps (using pbootmat
### in the first step to contain the bootstrap-generated p's). Note that we
### want the new data to be predictive samples for the old Xmat values, so we
### sample pi* ~ Beta(Xi+a, ni-Xi+b) and then Xi* ~ Binom(ni, pi*).

set.seed(7729)
pbootmat = array(rbeta(5.e7, rep(abhat[,1], each=1e4) + rep(Xmat, each=1000),
                 rep(abhat[,2], each=1e4) + rep(nvec-Xmat, each=1000)),
                 c(1000,10,5000))
Xboot.mat = array(rbinom(5.e7, rep(rep(nvec,5000), each=1000), pbootmat),
                 c(1000,10,5000))

### This whole 2-step bootstrap simulation took about 1 minute on my home laptop.

### Now note: the quantities Xboot.mat[b,i,r]/nvec[i] - pbootmat[b,i,r] should
### have approximately the same distribution, for all (b,i,r), as
### Xmat[i,r]/nvec[i] - pmat[i,r].

### Next create quantiles and bootstrap intervals:

r = 115
round( cbind(Xmat[,r]/nvec - t(apply((1/nvec)*t(Xboot.mat[,,r]) -
          t(pbootmat[,,r]), 1, function(col) quantile(col, prob=c(.975,.025)))),
       pmat[,r]), 4)

for (r in 1:5000) {
    CI.PBoot = Xmat[,r]/nvec - t(apply((1/nvec)*t(Xboot.mat[,,r]) -
          t(pbootmat[,,r]), 1, function(col) quantile(col, prob=c(.975,.025))))
    CI.PBoot[,1] = pmax(0, CI.PBoot[,1])
    CI.PBoot[,2] = pmin(1, CI.PBoot[,2])
    Cover[r,,1]  = (pmat[,r] > CI.PBoot[,1]) * (pmat[,r] <= CI.PBoot[,2])
    Length[r,,1] = CI.PBoot[,2] - CI.PBoot[,1]
 }

> round( cbind(PBoot.Cover  = apply(Cover[,,1],2,mean),
               PBoot.Length = apply(Length[,,1],2,mean),
               Bayes.Cover  = apply(Cover[,,2],2,mean),
               Bayes.Length = apply(Length[,,2],2,mean)), 4)
   PBoot.Cover PBoot.Length Bayes.Cover Bayes.Length
1       0.9192       0.3089      0.9552       0.3086
2       0.9208       0.2634      0.9538       0.2619
3       0.9256       0.2737      0.9576       0.2719
4       0.9196       0.2291      0.9484       0.2272
5       0.9248       0.2049      0.9512       0.2032
6       0.9280       0.2018      0.9512       0.1998
7       0.9252       0.2149      0.9472       0.2132
8       0.9272       0.2052      0.9540       0.2033
9       0.9356       0.1691      0.9540       0.1675
10      0.9174       0.2862      0.9506       0.2850

### Still some undercoverage, but not too bad. The Bayes intervals based on
### the correct ab0 and on ab=(1,2) are much better for roughly the same
### length, but other choices with much worse (a,b) might not be ...
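The nlm calls above optimize over (log a, log b) and exponentiate, so that (a,b) stay positive without constrained optimization. A small self-contained sketch of the same device on freshly simulated data; the 200 clusters of size 50 and true (a,b)=(2,5) are arbitrary toy choices, not HW values.

```r
## Marginal Beta-Binomial negative log-likelihood, as in NlogL above,
## minimized over th = (log a, log b) so a = exp(th[1]), b = exp(th[2]) > 0.
NlogL <- function(x, n, a, b) sum(lbeta(a, b) - lbeta(x + a, n - x + b))
set.seed(3)
nv <- rep(50, 200)           # 200 toy clusters of size 50
p  <- rbeta(200, 2, 5)       # true (a,b) = (2,5)
x  <- rbinom(200, nv, p)
fit <- nlm(function(th) NlogL(x, nv, exp(th[1]), exp(th[2])), c(0, 0))
exp(fit$estimate)            # MLE (a.hat, b.hat); should land roughly near (2,5)
```

With many clusters the ratio a/(a+b) is estimated tightly, while the concentration a+b is noisier, which matches the well-behaved histogram of abhat[,1]/(abhat[,1]+abhat[,2]) seen above despite occasional huge individual MLEs.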
##-----------------------------------------------------------------
(B)

library(MASS)
Insur = Insurance

> sapply(Insur[2:3], class)
     Group     Age
[1,] "ordered" "ordered"
[2,] "factor"  "factor"

#### Both "Group" and "Age" in the Insurance data frame are "ordered factors";
#### we convert these ordered factor columns of Insur to unordered ones:

> Insur$Group = factor(Insur$Group, ordered=F)
  Insur$Age = factor(Insur$Age, ordered=F)

> contrasts(Insur$Group)
       1-1.5l 1.5-2l >2l
<1l         0      0   0
1-1.5l      1      0   0
1.5-2l      0      1   0
>2l         0      0   1

## Run the forward stepwise selection with option k=2, fitting a glm "Poisson
## regression" model for the number of Claims in the dataset "Insurance"
## (which you can find in the library "MASS"), using the log number of policy
## holders as an "offset", and allowing all variables in the dataset and their
## interactions up to second order as possible variables in the regression model.

> InsurFit = step( glm(Claims ~ . - Holders, data=Insur, family=poisson,
        offset=log(Holders)), direction="forward", scope= .~ .^2, k=2)

### None of the second-order interactions is included in the stepwise fitted model.

> round(summary(InsurFit)$coef[,1:2], 6)
             Estimate Std. Error
(Intercept) -1.821740   0.076788
District2    0.025868   0.043016
District3    0.038524   0.050512
District4    0.234205   0.061673
Group1-1.5l  0.161337   0.050532
Group1.5-2l  0.392810   0.054998
Group>2l     0.563412   0.072315
Age25-29    -0.191010   0.082856
Age30-35    -0.344951   0.081374
Age>35      -0.536671   0.069956

### Next we reproduce the fitted coefficients and SE's by maximizing the logLik:

> poisLLik = function(beta) {
      Xm  = model.matrix(InsurFit)
      lam = Insur$Holders * exp(c(Xm %*% beta))
      Yv  = Insur$Claims
      sum( Yv*log(lam) - lam - lgamma(1+Yv) )
   }

> Newfit = nlm( function(beta) -poisLLik(beta),
                rep(0, length(InsurFit$coef)), hessian=T)

> t(round( rbind(est=Newfit$est, SE=sqrt(diag(solve(Newfit$hess)))), 6))
             est       SE
 [1,] -1.821748 0.076784
 [2,]  0.025869 0.043014
 [3,]  0.038525 0.050509
 [4,]  0.234206 0.061670
 [5,]  0.161339 0.050530
 [6,]  0.392813 0.054995
 [7,]  0.563414 0.072312
 [8,] -0.191005 0.082852
 [9,] -0.344945 0.081370
[10,] -0.536665 0.069952

### correct to the first 5 decimal places

> c(logLik(InsurFit), -Newfit$min)
[1] -184.3708 -184.3708

### confirms that we reproduced the right logLik

### Now we calculate the null, residual, and saturated model values of -2*logLik.

> logLik(glm(Claims ~ 1, data=Insur, offset = Holders))
'log Lik.' -494.4621 (df=2)
### (Caution: this call defaults to family=gaussian and uses Holders rather
###  than log(Holders) as offset, so it is not the Poisson null logLik we
###  want; we compute that directly below.)

### The intercept term in the Poisson null model with offset log(Holders) is
### int0 = log(sum(Yv)/sum(Hv)):

> Hv = Insur$Holders
  Yv = Insur$Claims
  int0 = log(sum(Yv)/sum(Hv))

> (-2)*c(Null = -sum(Hv*exp(int0)) + sum(Yv*log(Hv*exp(int0))) - sum(lgamma(1+Yv)),
         Fitted = logLik(InsurFit),
         Saturated = sum(-Yv) + sum(Yv[Yv>0]*log(Yv[Yv>0])) - sum(lgamma(1+Yv)))
     Null    Fitted Saturated
 553.5805  368.7416  317.3215

### Then check that the null and fitted-model values of -2*logLik, minus the
### saturated-model value, agree with the deviances given by R:

> c(InsurFit$null.dev, InsurFit$dev)
[1] 236.25896  51.42003
> c("null.dev" = 553.5805-317.3215, "resid.dev" = 368.7416-317.3215)
 null.dev resid.dev
 236.2590   51.4201
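The final check is an instance of the general identity deviance = 2*(saturated logLik - model logLik). A small sketch verifying this on a toy Poisson glm; the simulated data and coefficients (0.5 and 1) are arbitrary, not the Insurance fit.

```r
## Check deviance = 2*(saturated logLik - model logLik) for a Poisson glm.
## Toy simulated data; the linear predictor 0.5 + x is an arbitrary choice.
set.seed(4)
x <- runif(100)
y <- rpois(100, exp(0.5 + x))
fit   <- glm(y ~ x, family=poisson)
llsat <- sum(dpois(y, lambda=y, log=TRUE))   # saturated model: mu.i = y.i
llfit <- as.numeric(logLik(fit))
llnul <- as.numeric(logLik(glm(y ~ 1, family=poisson)))
c(resid.dev = 2*(llsat - llfit),   # matches fit$deviance
  null.dev  = 2*(llsat - llnul))   # matches fit$null.deviance
```

Note dpois(0, lambda=0) = 1, so the saturated logLik handles y = 0 cells automatically, just as the sum over Yv[Yv>0] does in the Insurance calculation above.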