HW 14 Solutions
===============
11/10/16

nvec = c(21, 30, 28, 41, 52, 55, 47, 52, 80, 25)
Xvec = c(4, 7, 19, 22, 29, 5, 3, 10, 3, 4)

## Negative log-likelihood for the beta-binomial model
NlogL = function(x,n,a,b) sum(lbeta(a,b) - lbeta(x+a, n-x+b))

#-------------------------- 1.(i)
## Minimize over a,b > 0

tmpfit = nlm(function(th) NlogL(Xvec, nvec, exp(th[1]), exp(th[2])), c(0,0),
             hessian=T, steptol=1.e-8, gradtol=1.e-8)
$minimum
[1] 201.3282
$estimate
[1] 0.1706774 1.1401662
$gradient
[1] -1.705303e-07  2.492769e-08
$hessian
          [,1]      [,2]
[1,] 12.698914 -8.772443
[2,] -8.772443  9.962646
$code
[1] 1
$iterations
[1] 8

## Here the estimates are
> ab = exp(tmpfit$est)
> ab
[1] 1.186108 3.127288

## or another way:
> nlminb(c(1,1), function(th) NlogL(Xvec,nvec,th[1],th[2]), lower=1.e-3)$par
[1] 1.186112 3.127299

## and the coordinatewise standard errors of the logs are
> SEth = sqrt(diag(solve(tmpfit$hess)))
> SEth
[1] 0.4483591 0.5061997     ### VERY IMPRECISE

### So the confidence intervals for the parameters, obtained by
### exponentiating the confidence intervals on the log scale, are:
> CI.a = exp(tmpfit$est[1] + c(-1,1)*1.96*SEth[1])
[1] 0.4925761 2.8561120
> CI.b = exp(tmpfit$est[2] + c(-1,1)*1.96*SEth[2])
[1] 1.159530 8.434392

### These could be replaced by delta-method intervals on the original
### (not logarithmic) scale for a and b, but the log-parameter estimates
### seem closer to normal, and the delta method does not seem well
### justified when the estimators are this imprecise.
### On the other hand, the log-estimators will themselves NOT turn out
### to be close to normally distributed (the right tails are much too
### long), so these intervals may well be less reasonable than those
### from the parametric bootstrap.
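### For comparison, here is a minimal sketch of the delta-method intervals
### on the original scale mentioned above. By the delta method,
### SE(exp(theta_j)) is approximately exp(theta_j)*SE(theta_j), so with
### ab and SEth as computed above (SE.ab, CI.a.delta, CI.b.delta are
### names introduced here for illustration):

SE.ab = ab * SEth                            # delta-method SEs for (a,b)
CI.a.delta = ab[1] + c(-1,1)*1.96*SE.ab[1]   # interval for a on original scale
CI.b.delta = ab[2] + c(-1,1)*1.96*SE.ab[2]   # interval for b on original scale

### These symmetric intervals ignore the skewness of the estimators,
### which is the reason the log-scale and bootstrap intervals are
### preferred in the discussion above.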
#---------------------------- 1.(ii)
## Parametric bootstrap:

parr = array(rbeta(5000*10, ab[1], ab[2]), c(5000,10))
Xarr = array(rbinom(5000*10, rep(nvec, each=5000), c(parr)), c(5000,10))

### It is hard to get convergence in all cases without modifying the
### optimizer's settings, e.g.:
abarr2 = t(apply(Xarr, 1, function(xrow) {
      tmp = nlm(function(th) NlogL(xrow, nvec, exp(th[1]), exp(th[2])),
                tmpfit$est, stepmax=10)
      c(tmp$est, tmp$code) }))
> table(abarr2[,3])
   1    2
4999    1
### Good convergence! -- achieved only with the stepmax setting and the
### log-parameterization. The nlm convergence code was needed to assess
### convergence in each replication.

quPB.a = quantile(abarr2[,1] - tmpfit$est[1], prob=c(.025,.975))
      2.5%      97.5%
-0.7398754  1.5395751
quPB.b = quantile(abarr2[,2] - tmpfit$est[2], prob=c(.025,.975))
      2.5%      97.5%
-0.7622082  1.7036624

### So the parametric-bootstrap intervals on the original scale, based
### on the MLE on the log scale, are:
> round(as.numeric(exp(tmpfit$est[1] - rev(quPB.a))), 4)
[1] 0.2544 2.4857
> round(as.numeric(exp(tmpfit$est[2] - rev(quPB.b))), 4)
[1] 0.5692 6.7018
### These intervals seem preferable to those from normal theory.

hist(abarr2[,1], nclass=32, prob=T)
## somewhat bell-shaped but definitely skewed right -- even on log-scale!

##----------------- 1.(iii)
## Parametric bootstrap with a different "true" parameter near ab

parr3 = array(rbeta(5000*10, ab[1]+0.1, ab[2]+0.2), c(5000,10))
Xarr3 = array(rbinom(5000*10, rep(nvec, each=5000), c(parr3)), c(5000,10))
abarr3 = t(apply(Xarr3, 1, function(xrow) {
      tmp = nlm(function(th) NlogL(xrow, nvec, exp(th[1]), exp(th[2])),
                log(ab + c(0.1,0.2)), stepmax=10)
      c(tmp$est, tmp$code) }))
table(abarr3[,3])       ### all convergence codes are 1
quPB3.a = quantile(abarr3[,1] - log(ab[1]+0.1), prob=c(.025,.975))
quPB3.b = quantile(abarr3[,2] - log(ab[2]+0.2), prob=c(.025,.975))
> quPB3.a
      2.5%      97.5%
-0.7318936  1.5987314
> quPB3.b
     2.5%     97.5%
-0.740609  1.775835
### VERY CLOSE TO WHAT WE GOT BEFORE!
### Could try more examples to confirm the pattern, but two or three
### are enough.

parr4 = array(rbeta(5000*10, ab[1]+0.1, ab[2]-0.2), c(5000,10))
Xarr4 = array(rbinom(5000*10, rep(nvec, each=5000), c(parr4)), c(5000,10))
abarr4 = t(apply(Xarr4, 1, function(xrow) {
      tmp = nlm(function(th) NlogL(xrow, nvec, exp(th[1]), exp(th[2])),
                log(ab + c(0.1,-0.2)), stepmax=10)
      c(tmp$est, tmp$code) }))
table(abarr4[,3])    ### almost all convergence codes are 1; all are OK
   1    2    3
4996    1    3
quPB4.a = quantile(abarr4[,1] - log(ab[1]+0.1), prob=c(.025,.975))
quPB4.b = quantile(abarr4[,2] - log(ab[2]-0.2), prob=c(.025,.975))
> quPB4.a
      2.5%      97.5%
-0.7288883  1.5551947
> quPB4.b
      2.5%      97.5%
-0.6493738  1.8495368
### Still a very similar pattern -- the histograms look similar too.

#----------------------------------- Part (2) (4 pts.)

## Under the Uniform(0,1) prior, each p[i] ~ Beta(X[i]+1, n[i]-X[i]+1)
## a posteriori, so the posterior-expectation point estimators are
## (X[i]+1)/(n[i]+2). The 95% credible intervals are
##   ( qbeta(.025, X[i]+1, n[i]-X[i]+1), qbeta(.975, X[i]+1, n[i]-X[i]+1) )
## exhibited in a matrix as:

round(rbind(lower = qbeta(.025, Xvec+1, nvec-Xvec+1),
            upper = qbeta(.975, Xvec+1, nvec-Xvec+1)), 3)
       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10]
lower 0.078 0.119 0.492 0.387 0.423 0.040 0.023 0.108 0.014 0.066
upper 0.403 0.411 0.821 0.680 0.684 0.196 0.172 0.320 0.104 0.349

#----------------------------------------- Part (3) (2 pts.)
# (a,b) = (0.5,2.5)
> round(rbind(lower = qbeta(.025, Xvec+0.5, nvec-Xvec+2.5),
              upper = qbeta(.975, Xvec+0.5, nvec-Xvec+2.5)), 3)
       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10]
lower 0.062 0.104 0.455 0.366 0.405 0.034 0.018 0.099 0.010 0.052
upper 0.362 0.382 0.787 0.656 0.665 0.182 0.154 0.304 0.094 0.315

# (a,b) = (2.5,0.5)
> round(rbind(lower = qbeta(.025, Xvec+2.5, nvec-Xvec+0.5),
              upper = qbeta(.975, Xvec+2.5, nvec-Xvec+0.5)), 3)
       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10]
lower 0.117 0.149 0.523 0.410 0.441 0.057 0.040 0.128 0.024 0.098
upper 0.461 0.451 0.840 0.699 0.699 0.226 0.209 0.346 0.128 0.402

## We can see that the 3 different choices of prior do lead to
## substantially different credible intervals for the p_i's.

#-------------------------------------------- Part (4) (2 pts. Extra.)

One could evaluate interval performance by comparing a frequentist CI
for p_i with the Bayesian credible interval, looking at the coverage
proportion and average length over a large number of combinations of
(a,b) choices specifying the prior, along with choices (a0,b0) of the
parameters from which the simulated data are generated in each of
R = 1000 iterations.

We already saw in parts (2) and (3) how to generate Bayesian credible
intervals for p_i. But how do we generate a frequentist CI for p_i?
One proposal would be parametric bootstrap!

Points given in this part:
1 pt. for a rough sketch of the idea of a simulation study like this,
      for fixed (a,b) and various choices (a0,b0), with a coverage
      indicator calculated in each replication r = 1,...,R for each
      interval type and each p_i. Note that "coverage" means p_i falls
      in the CI or credible interval, and "length" is the (random!)
      length of these intervals, for each i, r, and interval type.
1 pt. for realizing that the frequentist CI requires a new idea, which
      is sketched. The most natural way to do this, given what we have
      covered in the course, is the parametric bootstrap.
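### A rough sketch in code of the simulation study described above
### (illustrative only: a, b, a0, b0 are example choices, the names
### cover, len, Bayes are introduced here, and the parametric-bootstrap
### CI step is left as a stub):

R = 1000
a = 1;  b = 1                # prior used for the credible intervals
a0 = 1.2;  b0 = 3.1          # "true" parameters generating the data
cover = len = array(NA, c(R, 10, 2))   # third index: interval type
for (r in 1:R) {
   pvec = rbeta(10, a0, b0)               # simulate true p_i's
   Xr = rbinom(10, nvec, pvec)            # and the binomial counts
   Bayes = rbind(qbeta(.025, Xr+a, nvec-Xr+b),
                 qbeta(.975, Xr+a, nvec-Xr+b))
   cover[r,,1] = (pvec >= Bayes[1,]) & (pvec <= Bayes[2,])
   len[r,,1] = Bayes[2,] - Bayes[1,]
   ## the frequentist CI for each p_i (e.g. by parametric bootstrap)
   ## would be computed here and stored in cover[r,,2], len[r,,2]
}
round(apply(cover[,,1], 2, mean), 3)   # coverage proportion for each p_i
round(apply(len[,,1], 2, mean), 3)     # average length for each p_i

### Repeating this over a grid of (a,b) and (a0,b0) choices would give
### the comparison of coverage and length described above.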