HW 17 Solution
--------------

a0 = -1.5;  b0 = 0.6;  c0 = 1;  alpha = 0.2    ### alpha = 0.2 gives 80% confidence intervals

### To do this in a streamlined way, first create a function that takes a set of R
### datasets, each consisting of n pairs (X_i, Y_i) and represented as an R x n x 2 array,
### and calculates the OLS intercepts a, regression slopes b, and the slopes' SEs.

RegrEst = function(Dset) {
   n = dim(Dset)[2]
   R = dim(Dset)[1]
   Xbar  = c(Dset[,,1] %*% rep(1/n,n))    ### same as: apply(Dset[,,1],1,mean)
   X.ctr = Dset[,,1] - Xbar %o% rep(1,n)  ### R x n array of centered X[r,j]-Xbar[r]
   SXX   = c(X.ctr^2 %*% rep(1,n))        ### same as: (n-1)*apply(Dset[,,1],1,var)
   Ybar  = c(Dset[,,2] %*% rep(1/n,n))    ### same as: apply(Dset[,,2],1,mean)
   Ests  = array(0, c(R,3), dimnames=list(NULL,c("a","bLs","bLsSE")))
   Ests[,2] = c( (Dset[,,2]*X.ctr) %*% rep(1,n) )/SXX
   Ests[,1] = Ybar - Ests[,2]*Xbar
   Yresid   = (-Ybar + Dset[,,2]) - (Ests[,2]*X.ctr)
   Ests[,3] = sqrt((1/(n-2))*c( Yresid^2 %*% rep(1,n) )/SXX)
   Ests }

## The output R x 3 matrix contains columns of OLS intercept and slope estimators
## and a column of the slope estimators' corresponding estimated SEs.

## We provide a second version of exactly the same function for the special
## case where Dset[r,i,1] = Xv[i] is the same for all r; then denote Dset[,,2] by Yset.

RegrEst2 = function(Xv, Yset) {
   n = length(Xv)
   R = nrow(Yset)
   Xbar  = mean(Xv)
   X.ctr = Xv - Xbar
   SXX   = sum(X.ctr^2)
   Ybar  = c(Yset %*% rep(1/n,n))
   Ests  = array(0, c(R,3), dimnames=list(NULL,c("a","bLs","bLsSE")))
   Ests[,2] = c(Yset %*% (X.ctr/SXX))
   Ests[,1] = Ybar - Ests[,2]*Xbar
   Yresid   = (-Ybar + Yset) - (Ests[,2] %o% X.ctr)
   Ests[,3] = sqrt((1/((n-2)*SXX))*c( Yresid^2 %*% rep(1,n) ))
   Ests }

### NOTE that neither of these functions directly estimates c0: indeed, the row mean-square
### c( Yresid^2 %*% rep(1,n) )/(n-2) is an estimate of the variance of the residuals,
### which is c0^2*(pi^2/3). [Recall that Logistic(0,1) has variance pi^2/3.]
### So in the parametric bootstrap, we can do one of two things:
### (i)  use the non-MLE estimates suggested in the problem:
###      b0 estimated by bLs, a0 estimated by mean(Y)-bLs*mean(X), and c0 estimated as
###      sqrt((sum(Yresid^2)/(n-2))*3/pi^2),  OR
### (ii) use the MLEs based on the stated model.

### Steps 1. and 2.i -- with R=2000, n=65

set.seed(33779)
B = 2000;  R = 2000;  n = 65;  alpha = 0.2
Data = array(0, c(R,n,2), dimnames = list(NULL,NULL,c("X","Y")))
tmp  = array(runif(R*n), c(n,R))
Data[,,1] = t(tmp)
Data[,,2] = t(array(a0 + b0*tmp + c0*rlogis(R*n), c(n,R)))   ### the original simulated datasets

> summary(c(Data[,,2]))
      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
 -12.30206  -2.29909  -1.19535  -1.19231  -0.08819  10.54997
> summary(c(Data[,,1]))
      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
 0.0000015 0.2496155 0.4992102 0.4998092 0.7502565 0.9999917

### Bootstrap cycles will be done for each row respectively of Data[,,1] and
### of Data[,,2], which together constitute the set of X_i, Y_i for a single replicate dataset.

Ests.ini = RegrEst(Data)

### To see that the estimators are reasonable, consider:
> rbind(True = c(a0,b0), Est = apply(Ests.ini[,1:2],2,mean))
             a       bLs
True -1.500000 0.6000000
Est  -1.506264 0.6276679
> c(Emp.SE = sd(Ests.ini[,2]), AvgEst.SE = sqrt(mean(Ests.ini[,3]^2)))
   Emp.SE AvgEst.SE
0.7950695 0.7854089

### This says that the least-squares variance formula as calculated by
### my RegrEst function agrees with the sample s.d. of the bLs estimators
### across 2000 samples.

> 0.8*1.96/sqrt(2000)
[1] 0.03506155
### confirms that the difference between .6000 and .6277 above is within Monte Carlo error.
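### As an additional sanity check (my own addition, not part of the assignment),
### we can verify on a small illustrative example that RegrEst2 agrees with RegrEst
### in the shared-X special case it was written for:

Xv.chk   = runif(65)
Yset.chk = t(array(a0 + b0*Xv.chk + c0*rlogis(65*50), c(65,50)))   ### 50 x 65 response matrix
Dset.chk = array(0, c(50,65,2))
Dset.chk[,,1] = rep(1,50) %o% Xv.chk      ### all 50 replicates share the same X vector
Dset.chk[,,2] = Yset.chk
max(abs(RegrEst(Dset.chk) - RegrEst2(Xv.chk, Yset.chk)))   ### should be 0 up to rounding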
### Finally compare true c0 = 1 with
> c0.hat = sqrt((n-1)*apply(Data[,,1],1,var)*Ests.ini[,3]^2*3/pi^2)   ## use as c0.hat in PB
> summary(c0.hat)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 0.6974  0.9157  0.9876  0.9884  1.0566  1.3659

### Further checking that the RegrEst function gives identical results to lm:
### Try this using lm, directly

> auxests = array(0, c(2000,3))
  for(i in 1:2000) {
     tmp = lm(Data[i,,2] ~ Data[i,,1])
     auxests[i,1:2] = tmp$coef
     auxests[i,3]   = summary(tmp)$sigma*sqrt(summary(tmp)$cov.unscaled[2,2]) }
> rbind(summary(Ests.ini[,1]-auxests[,1]), summary(Ests.ini[,2]-auxests[,2]),
        summary(Ests.ini[,3]-auxests[,3]))
              Min.       1st Qu.       Median          Mean      3rd Qu.         Max.
[1,] -4.884981e-15 -6.661338e-16 0.000000e+00 -4.159173e-17 6.661338e-16 5.329071e-15
[2,] -6.661338e-15 -6.661338e-16 4.440892e-16  4.178940e-16 1.530460e-15 7.549517e-15
[3,] -8.881784e-16 -1.110223e-16 0.000000e+00 -2.325917e-17 1.110223e-16 6.661338e-16
### OK, agreement to machine precision.

### Step 2.(ii) -- Recall that X stays fixed when bootstrap samples are drawn in the
### Parametric Bootstrap, but that in the Nonparametric Bootstrap the indices i of the
### pairs (X_i,Y_i) are resampled iid !!

BootCIs = array(0, c(R,2,2,2), dimnames=list(NULL, c("PB","NPB"), c("L","U"),
                 c("Non-Stud","Stud'ized")))
             ### this is where we will store the R quadruples of CIs
DsetNPB = replace(Data,T,0)   ### but in general must have first dimension = B, which here is = R
### recall: c0.hat = sqrt((n-1)*apply(Data[,,1],1,var)*Ests.ini[,3]^2*3/pi^2)

system.time({
for (i in 1:R) {
   bhat = Ests.ini[i,2]
   ahat = Ests.ini[i,1]
   sigh = Ests.ini[i,3]
   Xv   = Data[i,,1]
   DatPB = t(array(rlogis(B*n, ahat+bhat*Xv, c0.hat[i]), c(n,B)))   ### scale = dataset i's c0 estimate
   inds  = sample(1:n, n*B, replace=T)
   PBests = RegrEst2(Xv, DatPB)[,-1]
### at this point have finished the Parametric Bootstrap and its estimates
   DsetNPB[,,1] = t(array(Data[i,inds,1], c(n,B)))
   DsetNPB[,,2] = t(array(Data[i,inds,2], c(n,B)))
   NPBest = RegrEst(DsetNPB)[,-1]
### now also have the Nonparametric Bootstrap and corresponding bLs and SE estimates
### Next compute the non-studentized PB and NPB intervals respectively
   BootCIs[i,1,,1] = 2*Ests.ini[i,2] - quantile(PBests[,1], prob=c(1-alpha/2, alpha/2))
   BootCIs[i,2,,1] = 2*Ests.ini[i,2] - quantile(NPBest[,1], prob=c(1-alpha/2, alpha/2))
### Conclude by computing respectively the studentized PB and NPB intervals
   BootCIs[i,1,,2] = Ests.ini[i,2] - quantile((PBests[,1]-Ests.ini[i,2])/PBests[,2],
                        prob=c(1-alpha/2, alpha/2))*sigh
   BootCIs[i,2,,2] = Ests.ini[i,2] - quantile((NPBest[,1]-Ests.ini[i,2])/NPBest[,2],
                        prob=c(1-alpha/2, alpha/2))*sigh
 } })
   user  system elapsed
  61.90    7.85   70.03
### Took only one minute with B = R = 2000

> apply(BootCIs, c(2,4), function(mat) mean(b0 < mat[,2] & b0 > mat[,1]))
    Non-Stud Stud'ized
PB    0.7805    0.7895
NPB   0.7765    0.7805
### compare using Monte Carlo SE sqrt(.8*.2/2000) = 0.00894
> apply(BootCIs, c(2,4), function(mat) mean(mat[,2]-mat[,1]))
    Non-Stud Stud'ized
PB  2.002640  2.019546
NPB 1.964507  1.999647
### We don't examine the variances of the lengths because the means are all so close.
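### To isolate the two interval formulas from the vectorized loop above, here is a
### minimal single-dataset parametric-bootstrap sketch (my own illustration, not part
### of the graded solution); it assumes RegrEst2, Data, and c0.hat as defined above:

PB.CIs.one = function(Xv, Yv, c0hat, B = 500, alpha = 0.2) {
   n    = length(Xv)
   est  = RegrEst2(Xv, matrix(Yv, nrow=1))        ### 1 x 3 row: (a, bLs, bLsSE)
   ahat = est[1,1];  bhat = est[1,2];  sehat = est[1,3]
   Ystar = t(array(rlogis(B*n, ahat + bhat*Xv, c0hat), c(n,B)))
   bt   = RegrEst2(Xv, Ystar)                     ### B x 3 bootstrap estimates
   ### basic ("non-studentized") interval: reflect bootstrap quantiles about bhat
   basic = 2*bhat - quantile(bt[,2], prob=c(1-alpha/2, alpha/2))
   ### studentized interval: bootstrap the t-ratio, then rescale by the data SE
   tstar = (bt[,2] - bhat)/bt[,3]
   stud  = bhat - quantile(tstar, prob=c(1-alpha/2, alpha/2))*sehat
   rbind(basic = basic, studentized = stud) }
### e.g.:  PB.CIs.one(Data[1,,1], Data[1,,2], c0.hat[1])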
### Re-do the entire analysis with B = R = 10000 (different seed)

set.seed(33781)
B = 10000;  R = 10000;  n = 65;  alpha = 0.2
Data = array(0, c(R,n,2), dimnames = list(NULL,NULL,c("X","Y")))
tmp  = array(runif(R*n), c(n,R))
Data[,,1] = t(tmp)
Data[,,2] = t(array(a0 + b0*tmp + c0*rlogis(R*n), c(n,R)))   ### the original simulated datasets
Ests.ini = RegrEst(Data)
BootCIs = array(0, c(R,2,2,2), dimnames=list(NULL, c("PB","NPB"), c("L","U"),
                 c("Non-Stud","Stud'ized")))
             ### this is where we will store the R quadruples of CIs
DsetNPB = replace(Data,T,0)   ### but in general must have first dimension = B, which here is = R
c0.hat  = sqrt((n-1)*apply(Data[,,1],1,var)*Ests.ini[,3]^2*3/pi^2)   ### recompute for the new Data

system.time({
for (i in 1:R) {
   bhat = Ests.ini[i,2]
   ahat = Ests.ini[i,1]
   sigh = Ests.ini[i,3]
   Xv   = Data[i,,1]
   DatPB = t(array(rlogis(B*n, ahat+bhat*Xv, c0.hat[i]), c(n,B)))
   inds  = sample(1:n, n*B, replace=T)
   PBests = RegrEst2(Xv, DatPB)[,-1]
### at this point have finished the Parametric Bootstrap and its estimates
   DsetNPB[,,1] = t(array(Data[i,inds,1], c(n,B)))
   DsetNPB[,,2] = t(array(Data[i,inds,2], c(n,B)))
   NPBest = RegrEst(DsetNPB)[,-1]
### now also have the Nonparametric Bootstrap and corresponding bLs and SE estimates
### Next compute the non-studentized PB and NPB intervals respectively
   BootCIs[i,1,,1] = 2*Ests.ini[i,2] - quantile(PBests[,1], prob=c(1-alpha/2, alpha/2))
   BootCIs[i,2,,1] = 2*Ests.ini[i,2] - quantile(NPBest[,1], prob=c(1-alpha/2, alpha/2))
### Conclude by computing respectively the studentized PB and NPB intervals
   BootCIs[i,1,,2] = Ests.ini[i,2] - quantile((PBests[,1]-Ests.ini[i,2])/PBests[,2],
                        prob=c(1-alpha/2, alpha/2))*sigh
   BootCIs[i,2,,2] = Ests.ini[i,2] - quantile((NPBest[,1]-Ests.ini[i,2])/NPBest[,2],
                        prob=c(1-alpha/2, alpha/2))*sigh
 } })
    user   system  elapsed
 1507.97   156.74  1670.30
### 25 minutes on a 3.7GHz dual-core Dell

> apply(BootCIs, c(2,4), function(mat) mean(b0 < mat[,2] & b0 > mat[,1]))
    Non-Stud Stud'ized
PB    0.7994    0.8021
NPB   0.7926    0.7945
#### compare using Monte Carlo SE sqrt(.8*.2/10000) = 0.004
> apply(BootCIs, c(2,4), function(mat) mean(mat[,2]-mat[,1]))
    Non-Stud Stud'ized
PB  2.004075  2.034110
NPB 1.987045  2.022846

BootCIs.old = BootCIs   ### to enable another run of the same type, B = R = 10000

Data = array(0, c(R,n,2), dimnames = list(NULL,NULL,c("X","Y")))
tmp  = array(runif(R*n), c(n,R))
Data[,,1] = t(tmp)
Data[,,2] = t(array(a0 + b0*tmp + c0*rlogis(R*n), c(n,R)))   ### the original simulated datasets
Ests.ini = RegrEst(Data)
### rest of loop not shown
    user   system  elapsed
 1483.92   156.00  1651.00

> apply(BootCIs, c(2,4), function(mat) mean(b0 < mat[,2] & b0 > mat[,1]))
    Non-Stud Stud'ized
PB    0.7959    0.8021
NPB   0.7878    0.7898
#### 2nd run of R = 10000; if averaged with the previous one, gives SE = .00283 !!
> 0.5*(apply(BootCIs, c(2,4), function(mat) mean(b0 < mat[,2] & b0 > mat[,1])) +
       rbind(c(0.7994, 0.8021), c(0.7926, 0.7945)))   ### first-run coverages, same layout
    Non-Stud Stud'ized
PB   0.79765   0.80210
NPB  0.79020   0.79215
> apply(BootCIs, c(2,4), function(mat) mean(mat[,2]-mat[,1]))
    Non-Stud Stud'ized
PB  2.005731  2.034696
NPB 1.984650  2.020119

##------------------------------------------------------------------------------------------
### From these runs so far, there seems to be no evidence of superior coverage
### (or of much systematic effect on length) from studentizing bootstrap confidence
### intervals in the context of least-squares (non-ML) slope estimators.
### But we try one more simulation to see whether bootstrapping ML estimators and
### corresponding studentized intervals do better.
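### To make the Monte Carlo error comparisons above explicit, here is a small sketch
### (my own addition) of the coverage-SE calculation, plus a rough z-statistic treating
### the PB and NPB coverages as independent (only an approximation, since both interval
### types are built from the same simulated datasets):

mcSE = function(p.hat, R) sqrt(p.hat*(1-p.hat)/R)   ### MC SE of coverage over R datasets
mcSE(0.8,  2000)    ### 0.00894, as quoted above
mcSE(0.8, 10000)    ### 0.004
mcSE(0.8, 20000)    ### 0.00283, for the two pooled R = 10000 runs
zcov = function(p1, p2, R) (p1-p2)/sqrt(p1*(1-p1)/R + p2*(1-p2)/R)
zcov(0.8021, 0.7945, 10000)   ### roughly 1.3: no significant PB-vs-NPB difference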
### These last steps are definitely BEYOND THE SCOPE OF HW 17

## parameterize by a, b, log(c)
NLlk = function(par, Xv, Yv)  -sum(log(dlogis(Yv, par[1]+par[2]*Xv, exp(par[3]))))

MLest = function(Dset, par.ini = c(a0, b0, log(c0))) {
   n = dim(Dset)[2]
   R = dim(Dset)[1]
   Ests = array(0, c(R,4), dimnames=list(NULL, c("a","b","c","bML.SE")))
   for (i in 1:R) {
      tmp = nlm(NLlk, par.ini, hessian=T, Xv=Dset[i,,1], Yv=Dset[i,,2])
      Ests[i,] = c(tmp$estimate[1:2], exp(tmp$estimate[3]), sqrt(solve(tmp$hessian)[2,2])) }
   Ests }

## Recall
> dim(Data)
[1] 10000    65     2
> MLest(Data[1:5,,])
             a         b         c    bML.SE
[1,] -1.298745 0.3338237 1.2229607 0.8999171
[2,] -1.264990 0.2818456 0.9476586 0.7598170
[3,] -1.337808 0.5829395 1.1770516 1.0985659
[4,] -1.761292 0.7542291 1.0729256 0.8154064
[5,] -1.449497 0.6329077 1.2137912 0.8249770
### OK, looks good; recall the true parameters are
> c(a0,b0,c0)
[1] -1.5  0.6  1.0

Ests.ini2 = MLest(Data)

### Now for the Parametric & Nonparametric Bootstrap with B = R = 10000

BootCIs3 = array(0, c(R,2,2,2), dimnames=list(NULL, c("PB","NPB"), c("L","U"),
                  c("Non-Stud","Stud'ized")))
             ### this is where we will store the R quadruples of CIs
DsetNPB3 = DsetPB3 = replace(Data,T,0)   ### must have first dimension = B, which here is = R

system.time({
for (i in 1:R) {
   bhat = Ests.ini2[i,2]
   ahat = Ests.ini2[i,1]
   chat = Ests.ini2[i,3]
   Xv   = Data[i,,1]
   DsetPB3[,,1] = rep(1,B) %o% Xv
   DsetPB3[,,2] = t(array(rlogis(B*n, ahat+bhat*Xv, chat), c(n,B)))
   inds = sample(1:n, n*B, replace=T)
   DsetNPB3[,,1] = t(array(Data[i,inds,1], c(n,B)))
   DsetNPB3[,,2] = t(array(Data[i,inds,2], c(n,B)))
   PBests = MLest(DsetPB3)[,c(2,4)]
### at this point have finished the Parametric Bootstrap and its estimates
   NPBest = MLest(DsetNPB3)[,c(2,4)]
### now also have the Nonparametric Bootstrap and corresponding bML and SE estimates
### Next compute the non-studentized PB and NPB intervals respectively
   BootCIs3[i,1,,1] = 2*Ests.ini2[i,2] - quantile(PBests[,1], prob=c(1-alpha/2, alpha/2))
   BootCIs3[i,2,,1] = 2*Ests.ini2[i,2] - quantile(NPBest[,1], prob=c(1-alpha/2, alpha/2))
### Conclude by computing respectively the studentized PB and NPB intervals
   BootCIs3[i,1,,2] = Ests.ini2[i,2] - quantile((PBests[,1]-Ests.ini2[i,2])/PBests[,2],
                         prob=c(1-alpha/2, alpha/2))*Ests.ini2[i,4]
   BootCIs3[i,2,,2] = Ests.ini2[i,2] - quantile((NPBest[,1]-Ests.ini2[i,2])/NPBest[,2],
                         prob=c(1-alpha/2, alpha/2))*Ests.ini2[i,4]
 } })

[This one is still running: will include results when finished]
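### While that runs, here is a small single-dataset sketch (my own addition, assuming
### NLlk, RegrEst, and Data as defined above) showing the nlm machinery in isolation;
### it also suggests that warm-starting nlm at the fast least-squares fit reaches the
### same optimum, which could speed up MLest inside the bootstrap loop:

Xv = Data[1,,1];  Yv = Data[1,,2]
fit = nlm(NLlk, c(a0, b0, log(c0)), hessian=T, Xv=Xv, Yv=Yv)
c(b.ML = fit$estimate[2], SE.ML = sqrt(solve(fit$hessian)[2,2]))
### warm start at the least-squares estimates (log(c) started at 0, i.e. c = 1):
ls   = RegrEst(Data[1,,,drop=FALSE])
fit2 = nlm(NLlk, c(ls[1,1], ls[1,2], 0), hessian=T, Xv=Xv, Yv=Yv)
max(abs(fit$estimate - fit2$estimate))   ### should be small, up to nlm's tolerance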