HW 17 Solution
--------------

a0 = -1.5;  b0 = 0.6;  c0 = 1;  alpha = 0.2    ### alpha = 0.2 gives 80% confidence intervals

### To do this in a streamlined way, first create a function that takes a set of R
### datasets, each consisting of n pairs (X_i, Y_i) and represented as an R x n x 2 array,
### and calculates the OLS intercepts a, regression slopes b, and the slopes' SEs.

RegrEst = function(Dset) {
   n = dim(Dset)[2]
   R = dim(Dset)[1]
   Xbar  = c(Dset[,,1] %*% rep(1/n,n))    ### same as: apply(Dset[,,1],1,mean)
   X.ctr = Dset[,,1] - Xbar %o% rep(1,n)  ### R x n array of centered X[r,j]-Xbar[r]
   SXX   = c(X.ctr^2 %*% rep(1,n))        ### same as: (n-1)*apply(Dset[,,1],1,var)
   Ybar  = c(Dset[,,2] %*% rep(1/n,n))    ### same as: apply(Dset[,,2],1,mean)
   Ests  = array(0, c(R,3), dimnames=list(NULL,c("a","bLs","bLsSE")))
   Ests[,2] = c( (Dset[,,2]*X.ctr) %*% rep(1,n) )/SXX
   Ests[,1] = Ybar - Ests[,2]*Xbar
   Yresid   = (-Ybar + Dset[,,2]) - (Ests[,2]*X.ctr)
   Ests[,3] = sqrt((1/(n-2))*c( Yresid^2 %*% rep(1,n) )/SXX)
   Ests }

## The output R x 3 matrix contains columns of OLS intercept and slope estimators
## and a column of the slope estimators' corresponding estimated SEs.

## We provide a second version of exactly the same function for the special
## case where Dset[r,i,1] = Xv[i] is the same for all r; then denote Dset[,,2] by Yset.

RegrEst2 = function(Xv, Yset) {
   n = length(Xv)
   R = nrow(Yset)
   Xbar  = mean(Xv)
   X.ctr = Xv - Xbar
   SXX   = sum(X.ctr^2)
   Ybar  = c(Yset %*% rep(1/n,n))
   Ests  = array(0, c(R,3), dimnames=list(NULL,c("a","bLs","bLsSE")))
   Ests[,2] = c(Yset %*% (X.ctr/SXX))
   Ests[,1] = Ybar - Ests[,2]*Xbar
   Yresid   = (-Ybar + Yset) - (Ests[,2] %o% X.ctr)
   Ests[,3] = sqrt((1/((n-2)*SXX))*c( Yresid^2 %*% rep(1,n) ))
   Ests }

### NOTE that neither of these functions directly estimates c0: indeed, the row mean-square
### c( Yresid^2 %*% rep(1,n) )/(n-2) is an estimate of the variance of the residuals,
### which is c0^2*(pi^2/3). [Recall that Logistic(0,1) has variance pi^2/3.]
### So in the parametric bootstrap, we can do one of two things:
### (i)  use the non-MLE estimates suggested in the problem:
###      b0 estimated by bLs, a0 estimated by mean(Y)-bLs*mean(X), and c0 estimated as
###      sqrt((sum(Yresid^2)/(n-2))*3/pi^2),  OR
### (ii) use the MLEs based on the stated model.

### Steps 1. and 2.i -- with R=2000, n=65

set.seed(33779)
B = 2000;  R = 2000;  n = 65;  alpha = 0.2
Data = array(0, c(R,n,2), dimnames = list(NULL,NULL,c("X","Y")))
tmp  = array(runif(R*n), c(n,R))
Data[,,1] = t(tmp)
Data[,,2] = t(array(a0 + b0*tmp + c0*rlogis(R*n), c(n,R)))   ### the original simulated datasets

> summary(c(Data[,,2]))
      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
 -12.30206  -2.29909  -1.19535  -1.19231  -0.08819  10.54997
> summary(c(Data[,,1]))
      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
 0.0000015 0.2496155 0.4992102 0.4998092 0.7502565 0.9999917

### Bootstrap cycles will be done for each row respectively of Data[,,1] and
### of Data[,,2], which together constitute the set of X_i, Y_i for a single replicate dataset.

Ests.ini = RegrEst(Data)

### To see that the estimators are reasonable, consider:
> rbind(True = c(a0,b0), Est = apply(Ests.ini[,1:2],2,mean))
             a       bLs
True -1.500000 0.6000000
Est  -1.506264 0.6276679
> c(Emp.SE = sd(Ests.ini[,2]), AvgEst.SE = sqrt(mean(Ests.ini[,3]^2)))
   Emp.SE AvgEst.SE
0.7950695 0.7854089

### This says that the least-squares variance formula as calculated by
### my RegrEst function agrees with the sample s.d. of the bLs estimators
### across 2000 samples.

> 0.8*1.96/sqrt(2000)
[1] 0.03506155
### confirms that the difference between .6000 and .6277 above is within Monte Carlo error.
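### As an additional sanity check (my own addition, not part of the assignment),
### we can verify on a small illustrative example that RegrEst2 agrees with RegrEst
### in the shared-X special case it was written for:

Xv.chk   = runif(65)
Yset.chk = t(array(a0 + b0*Xv.chk + c0*rlogis(65*50), c(65,50)))   ### 50 x 65 response matrix
Dset.chk = array(0, c(50,65,2))
Dset.chk[,,1] = rep(1,50) %o% Xv.chk      ### all 50 replicates share the same X vector
Dset.chk[,,2] = Yset.chk
max(abs(RegrEst(Dset.chk) - RegrEst2(Xv.chk, Yset.chk)))   ### should be 0 up to rounding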
### Finally compare true c0 = 1 with
> c0.hat = sqrt((n-1)*apply(Data[,,1],1,var)*Ests.ini[,3]^2*3/pi^2)   ## use as c0.hat in PB
> summary(c0.hat)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 0.6974  0.9157  0.9876  0.9884  1.0566  1.3659

### Further checking that the RegrEst function gives identical results to lm:
### Try this using lm, directly

> auxests = array(0, c(2000,3))
  for(i in 1:2000) {
     tmp = lm(Data[i,,2] ~ Data[i,,1])
     auxests[i,1:2] = tmp$coef
     auxests[i,3]   = summary(tmp)$sigma*sqrt(summary(tmp)$cov.unscaled[2,2]) }
> rbind(summary(Ests.ini[,1]-auxests[,1]), summary(Ests.ini[,2]-auxests[,2]),
        summary(Ests.ini[,3]-auxests[,3]))
              Min.       1st Qu.       Median          Mean      3rd Qu.         Max.
[1,] -4.884981e-15 -6.661338e-16 0.000000e+00 -4.159173e-17 6.661338e-16 5.329071e-15
[2,] -6.661338e-15 -6.661338e-16 4.440892e-16  4.178940e-16 1.530460e-15 7.549517e-15
[3,] -8.881784e-16 -1.110223e-16 0.000000e+00 -2.325917e-17 1.110223e-16 6.661338e-16
### OK, agreement to machine precision.

### Step 2.(ii) -- Recall that X stays fixed when bootstrap samples are drawn in the
### Parametric Bootstrap, but that in the Nonparametric Bootstrap the indices i of the
### pairs (X_i,Y_i) are resampled iid !!

BootCIs = array(0, c(R,2,2,2), dimnames=list(NULL, c("PB","NPB"), c("L","U"),
                 c("Non-Stud","Stud'ized")))
             ### this is where we will store the R quadruples of CIs
DsetNPB = replace(Data,T,0)   ### but in general must have first dimension = B, which here is = R
### recall: c0.hat = sqrt((n-1)*apply(Data[,,1],1,var)*Ests.ini[,3]^2*3/pi^2)

system.time({
for (i in 1:R) {
   bhat = Ests.ini[i,2]
   ahat = Ests.ini[i,1]
   sigh = Ests.ini[i,3]
   Xv   = Data[i,,1]
   DatPB = t(array(rlogis(B*n, ahat+bhat*Xv, c0.hat[i]), c(n,B)))   ### scale = dataset i's c0 estimate
   inds  = sample(1:n, n*B, replace=T)
   PBests = RegrEst2(Xv, DatPB)[,-1]
### at this point have finished the Parametric Bootstrap and its estimates
   DsetNPB[,,1] = t(array(Data[i,inds,1], c(n,B)))
   DsetNPB[,,2] = t(array(Data[i,inds,2], c(n,B)))
   NPBest = RegrEst(DsetNPB)[,-1]
### now also have the Nonparametric Bootstrap and corresponding bLs and SE estimates
### Next compute the non-studentized PB and NPB intervals respectively
   BootCIs[i,1,,1] = 2*Ests.ini[i,2] - quantile(PBests[,1], prob=c(1-alpha/2, alpha/2))
   BootCIs[i,2,,1] = 2*Ests.ini[i,2] - quantile(NPBest[,1], prob=c(1-alpha/2, alpha/2))
### Conclude by computing respectively the studentized PB and NPB intervals
   BootCIs[i,1,,2] = Ests.ini[i,2] - quantile((PBests[,1]-Ests.ini[i,2])/PBests[,2],
                        prob=c(1-alpha/2, alpha/2))*sigh
   BootCIs[i,2,,2] = Ests.ini[i,2] - quantile((NPBest[,1]-Ests.ini[i,2])/NPBest[,2],
                        prob=c(1-alpha/2, alpha/2))*sigh
 } })
   user  system elapsed
  61.90    7.85   70.03
### Took only one minute with B = R = 2000

> apply(BootCIs, c(2,4), function(mat) mean(b0 < mat[,2] & b0 > mat[,1]))
    Non-Stud Stud'ized
PB    0.7805    0.7895
NPB   0.7765    0.7805
### compare using Monte Carlo SE sqrt(.8*.2/2000) = 0.00894
> apply(BootCIs, c(2,4), function(mat) mean(mat[,2]-mat[,1]))
    Non-Stud Stud'ized
PB  2.002640  2.019546
NPB 1.964507  1.999647
### We don't examine the variances of the lengths because the means are all so close.
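### To isolate the two interval formulas from the vectorized loop above, here is a
### minimal single-dataset parametric-bootstrap sketch (my own illustration, not part
### of the graded solution); it assumes RegrEst2, Data, and c0.hat as defined above:

PB.CIs.one = function(Xv, Yv, c0hat, B = 500, alpha = 0.2) {
   n    = length(Xv)
   est  = RegrEst2(Xv, matrix(Yv, nrow=1))        ### 1 x 3 row: (a, bLs, bLsSE)
   ahat = est[1,1];  bhat = est[1,2];  sehat = est[1,3]
   Ystar = t(array(rlogis(B*n, ahat + bhat*Xv, c0hat), c(n,B)))
   bt   = RegrEst2(Xv, Ystar)                     ### B x 3 bootstrap estimates
   ### basic ("non-studentized") interval: reflect bootstrap quantiles about bhat
   basic = 2*bhat - quantile(bt[,2], prob=c(1-alpha/2, alpha/2))
   ### studentized interval: bootstrap the t-ratio, then rescale by the data SE
   tstar = (bt[,2] - bhat)/bt[,3]
   stud  = bhat - quantile(tstar, prob=c(1-alpha/2, alpha/2))*sehat
   rbind(basic = basic, studentized = stud) }
### e.g.:  PB.CIs.one(Data[1,,1], Data[1,,2], c0.hat[1])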
### Re-do the entire analysis with B = R = 10000 (different seed)

set.seed(33781)
B = 10000;  R = 10000;  n = 65;  alpha = 0.2
Data = array(0, c(R,n,2), dimnames = list(NULL,NULL,c("X","Y")))
tmp  = array(runif(R*n), c(n,R))
Data[,,1] = t(tmp)
Data[,,2] = t(array(a0 + b0*tmp + c0*rlogis(R*n), c(n,R)))   ### the original simulated datasets
Ests.ini = RegrEst(Data)
BootCIs = array(0, c(R,2,2,2), dimnames=list(NULL, c("PB","NPB"), c("L","U"),
                 c("Non-Stud","Stud'ized")))
             ### this is where we will store the R quadruples of CIs
DsetNPB = replace(Data,T,0)   ### but in general must have first dimension = B, which here is = R
c0.hat  = sqrt((n-1)*apply(Data[,,1],1,var)*Ests.ini[,3]^2*3/pi^2)   ### recompute for the new Data

system.time({
for (i in 1:R) {
   bhat = Ests.ini[i,2]
   ahat = Ests.ini[i,1]
   sigh = Ests.ini[i,3]
   Xv   = Data[i,,1]
   DatPB = t(array(rlogis(B*n, ahat+bhat*Xv, c0.hat[i]), c(n,B)))
   inds  = sample(1:n, n*B, replace=T)
   PBests = RegrEst2(Xv, DatPB)[,-1]
### at this point have finished the Parametric Bootstrap and its estimates
   DsetNPB[,,1] = t(array(Data[i,inds,1], c(n,B)))
   DsetNPB[,,2] = t(array(Data[i,inds,2], c(n,B)))
   NPBest = RegrEst(DsetNPB)[,-1]
### now also have the Nonparametric Bootstrap and corresponding bLs and SE estimates
### Next compute the non-studentized PB and NPB intervals respectively
   BootCIs[i,1,,1] = 2*Ests.ini[i,2] - quantile(PBests[,1], prob=c(1-alpha/2, alpha/2))
   BootCIs[i,2,,1] = 2*Ests.ini[i,2] - quantile(NPBest[,1], prob=c(1-alpha/2, alpha/2))
### Conclude by computing respectively the studentized PB and NPB intervals
   BootCIs[i,1,,2] = Ests.ini[i,2] - quantile((PBests[,1]-Ests.ini[i,2])/PBests[,2],
                        prob=c(1-alpha/2, alpha/2))*sigh
   BootCIs[i,2,,2] = Ests.ini[i,2] - quantile((NPBest[,1]-Ests.ini[i,2])/NPBest[,2],
                        prob=c(1-alpha/2, alpha/2))*sigh
 } })
    user   system  elapsed
 1507.97   156.74  1670.30
### 25 minutes on a 3.7GHz dual-core Dell

> apply(BootCIs, c(2,4), function(mat) mean(b0 < mat[,2] & b0 > mat[,1]))
    Non-Stud Stud'ized
PB    0.7994    0.8021
NPB   0.7926    0.7945
#### compare using Monte Carlo SE sqrt(.8*.2/10000) = 0.004
> apply(BootCIs, c(2,4), function(mat) mean(mat[,2]-mat[,1]))
    Non-Stud Stud'ized
PB  2.004075  2.034110
NPB 1.987045  2.022846

BootCIs.old = BootCIs   ### to enable another run of the same type, B = R = 10000

Data = array(0, c(R,n,2), dimnames = list(NULL,NULL,c("X","Y")))
tmp  = array(runif(R*n), c(n,R))
Data[,,1] = t(tmp)
Data[,,2] = t(array(a0 + b0*tmp + c0*rlogis(R*n), c(n,R)))   ### the original simulated datasets
Ests.ini = RegrEst(Data)
### rest of loop not shown
    user   system  elapsed
 1483.92   156.00  1651.00

> apply(BootCIs, c(2,4), function(mat) mean(b0 < mat[,2] & b0 > mat[,1]))
    Non-Stud Stud'ized
PB    0.7959    0.8021
NPB   0.7878    0.7898
#### 2nd run of R = 10000; if averaged with the previous one, gives SE = .00283 !!
> 0.5*(apply(BootCIs, c(2,4), function(mat) mean(b0 < mat[,2] & b0 > mat[,1])) +
       rbind(c(0.7994, 0.8021), c(0.7926, 0.7945)))   ### first-run coverages, same layout
    Non-Stud Stud'ized
PB   0.79765   0.80210
NPB  0.79020   0.79215
> apply(BootCIs, c(2,4), function(mat) mean(mat[,2]-mat[,1]))
    Non-Stud Stud'ized
PB  2.005731  2.034696
NPB 1.984650  2.020119

##------------------------------------------------------------------------------------------
### From these runs so far, there seems to be no evidence of superior coverage
### (or of much systematic effect on length) from studentizing bootstrap confidence
### intervals in the context of least-squares (non-ML) slope estimators.
### But we try one more simulation to see whether bootstrapping ML estimators and
### corresponding studentized intervals do better.
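### To make the Monte Carlo error comparisons above explicit, here is a small sketch
### (my own addition) of the coverage-SE calculation, plus a rough z-statistic treating
### the PB and NPB coverages as independent (only an approximation, since both interval
### types are built from the same simulated datasets):

mcSE = function(p.hat, R) sqrt(p.hat*(1-p.hat)/R)   ### MC SE of coverage over R datasets
mcSE(0.8,  2000)    ### 0.00894, as quoted above
mcSE(0.8, 10000)    ### 0.004
mcSE(0.8, 20000)    ### 0.00283, for the two pooled R = 10000 runs
zcov = function(p1, p2, R) (p1-p2)/sqrt(p1*(1-p1)/R + p2*(1-p2)/R)
zcov(0.8021, 0.7945, 10000)   ### roughly 1.3: no significant PB-vs-NPB difference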
### These last steps are definitely BEYOND THE SCOPE OF HW 17

## parameterize by a, b, log(c)
NLlk = function(par, Xv, Yv)  -sum(log(dlogis(Yv, par[1]+par[2]*Xv, exp(par[3]))))

MLest = function(Dset, par.ini = c(a0, b0, log(c0))) {
   n = dim(Dset)[2]
   R = dim(Dset)[1]
   Ests = array(0, c(R,4), dimnames=list(NULL, c("a","b","c","bML.SE")))
   for (i in 1:R) {
      tmp = nlm(NLlk, par.ini, hessian=T, Xv=Dset[i,,1], Yv=Dset[i,,2])
      Ests[i,] = c(tmp$estimate[1:2], exp(tmp$estimate[3]), sqrt(solve(tmp$hessian)[2,2])) }
   Ests }

## Recall
> dim(Data)
[1] 10000    65     2
> MLest(Data[1:5,,])
             a         b         c    bML.SE
[1,] -1.298745 0.3338237 1.2229607 0.8999171
[2,] -1.264990 0.2818456 0.9476586 0.7598170
[3,] -1.337808 0.5829395 1.1770516 1.0985659
[4,] -1.761292 0.7542291 1.0729256 0.8154064
[5,] -1.449497 0.6329077 1.2137912 0.8249770
### OK, looks good; recall the true parameters are
> c(a0,b0,c0)
[1] -1.5  0.6  1.0

Ests.ini2 = MLest(Data)

### Now for the Parametric & Nonparametric Bootstrap with B = R = 10000

BootCIs3 = array(0, c(R,2,2,2), dimnames=list(NULL, c("PB","NPB"), c("L","U"),
                  c("Non-Stud","Stud'ized")))
             ### this is where we will store the R quadruples of CIs
DsetNPB3 = DsetPB3 = replace(Data,T,0)   ### must have first dimension = B, which here is = R

system.time({
for (i in 1:R) {
   bhat = Ests.ini2[i,2]
   ahat = Ests.ini2[i,1]
   chat = Ests.ini2[i,3]
   Xv   = Data[i,,1]
   DsetPB3[,,1] = rep(1,B) %o% Xv
   DsetPB3[,,2] = t(array(rlogis(B*n, ahat+bhat*Xv, chat), c(n,B)))
   inds = sample(1:n, n*B, replace=T)
   DsetNPB3[,,1] = t(array(Data[i,inds,1], c(n,B)))
   DsetNPB3[,,2] = t(array(Data[i,inds,2], c(n,B)))
   PBests = MLest(DsetPB3)[,c(2,4)]
### at this point have finished the Parametric Bootstrap and its estimates
   NPBest = MLest(DsetNPB3)[,c(2,4)]
### now also have the Nonparametric Bootstrap and corresponding bML and SE estimates
### Next compute the non-studentized PB and NPB intervals respectively
   BootCIs3[i,1,,1] = 2*Ests.ini2[i,2] - quantile(PBests[,1], prob=c(1-alpha/2, alpha/2))
   BootCIs3[i,2,,1] = 2*Ests.ini2[i,2] - quantile(NPBest[,1], prob=c(1-alpha/2, alpha/2))
### Conclude by computing respectively the studentized PB and NPB intervals
   BootCIs3[i,1,,2] = Ests.ini2[i,2] - quantile((PBests[,1]-Ests.ini2[i,2])/PBests[,2],
                         prob=c(1-alpha/2, alpha/2))*Ests.ini2[i,4]
   BootCIs3[i,2,,2] = Ests.ini2[i,2] - quantile((NPBest[,1]-Ests.ini2[i,2])/NPBest[,2],
                         prob=c(1-alpha/2, alpha/2))*Ests.ini2[i,4]
 } })

[This one is still running: will include results when finished]
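### While that runs, here is a small single-dataset sketch (my own addition, assuming
### NLlk, RegrEst, and Data as defined above) showing the nlm machinery in isolation;
### it also suggests that warm-starting nlm at the fast least-squares fit reaches the
### same optimum, which could speed up MLest inside the bootstrap loop:

Xv = Data[1,,1];  Yv = Data[1,,2]
fit = nlm(NLlk, c(a0, b0, log(c0)), hessian=T, Xv=Xv, Yv=Yv)
c(b.ML = fit$estimate[2], SE.ML = sqrt(solve(fit$hessian)[2,2]))
### warm start at the least-squares estimates (log(c) started at 0, i.e. c = 1):
ls   = RegrEst(Data[1,,,drop=FALSE])
fit2 = nlm(NLlk, c(ls[1,1], ls[1,2], 0), hessian=T, Xv=Xv, Yv=Yv)
max(abs(fit$estimate - fit2$estimate))   ### should be small, up to nlm's tolerance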