CLASS LOG FOR NOVEMBER 15, 2017 IN-CLASS DEMONSTRATION ON BOOTSTRAP CONFIDENCE INTRVALS ======================================================================================= > library(MASS) galaxies=galaxies/1000 hist(galaxies, nclass=18, prob=T) lines(density(galaxies), lty=3, col="red") LET US TAKE AS OUR GOAL TO PROVIDE CONFIDENCE INTERVAL FOR THE MEDIAN ### Picture suggests multimodal density, so let us take 3-component normal as parametric family > trimodnorm = function(x, par) { ### 8 parameters, since sum(pvec)=1 muvec = par[1:3]; sigvec = par[4:6]; pvec = c(par[7:8], 1-sum(par[7:8])) ### allow x also to be a vector of arbitrary dim d llkvec = log(c(cbind(dnorm(x,muvec[1],sigvec[1]),dnorm(x,muvec[2],sigvec[2]), dnorm(x,muvec[3],sigvec[3])) %*% pvec)) list(logLik=sum(llkvec), llkvec=llkvec) } ### for identifiability, enforce muvec entries in increasing order ### Par for optimization: muvec[1], log(muvec[2]-muvec[1]), log(muvec[3]-muvec[2]), ### log(sigvec), qlogis(pvec[1]), qlogis(pvec[2]/(1-pvec[1])) > par.trans = function(par) { pq = plogis(par[7:8]) c(par[1],par[1]+exp(par[2]), par[1]+exp(par[2])+exp(par[3]), exp(par[4:6]), pq[1], (1-pq[1])*pq[2], (1-pq[1])*(1-pq[2])) } > fitmnorm = nlm(function(par) -trimodnorm(galaxies,par.trans(par))$logLik, c(5,2,3,3,3,3,0,0), hessian=T) 1: In nlm(function(par) -trimodnorm(galaxies, par.trans(par))$logLik, : NA/Inf replaced by maximum positive value ### plus 4 other similar messages fitmnorm[-4] $minimum [1] 203.1792 $estimate [1] 9.71013982 2.45873020 2.45481501 -0.86154432 0.78597518 -0.08151702 [7] -2.37158470 3.17810562 $gradient [1] -3.062051e-06 -7.874475e-05 9.695023e-06 -3.617657e-06 1.974612e-05 [6] -6.504308e-07 -8.009686e-07 -1.477379e-07 $code [1] 1 $iterations [1] 58 ### Now translate the estimated parameter back to mu,sig, p > par0 = par.trans(fitmnorm$est) curve(exp(trimodnorm(x,par0)$llk), c(0,40), add=T, col="blue") ### We can see that this mixture of normals does a good job of fitting to the data, ### but it takes a lot of parameters ! ### In terms of the parametric model, the median is a function of parameters > trimodcdf = function(x, par) { ### 8 parameters, since sum(pvec)=1 muvec = par[1:3]; sigvec = par[4:6]; pvec = c(par[7:8], 1-sum(par[7:8])) ### allow x also to be a vector of arbitrary dim d c(cbind(pnorm(x,muvec[1],sigvec[1]),pnorm(x,muvec[2],sigvec[2]), pnorm(x,muvec[3],sigvec[3])) %*% pvec) } > medtrinorm = function(par, range) uniroot(function(x) trimodcdf(x,par)-0.5, range)$root > c(median(galaxies), medtrinorm(par0,c(0,50))) [1] 20.83350 21.24717 ### Next the (non-studentized) Parametric Bootstrap CI > set.seed(2215) ind = sample(1:3, 82*10000, replace=T, prob=par0[7:9]) PBgalx = array(rnorm(8.2e5, par0[ind], par0[3+ind]), c(1e4,82)) medPB = numeric(1e4) for(i in 1:1e4) { par1 = par.trans(nlm(function(par) -trimodnorm(PBgalx[i,], par.trans(par))$logLik, c(5,2,3,3,3,3,0,0))$est) medPB[i] = medtrinorm(par1,c(0,50)) } ### fairly quick !! > summary(medPB) Min. 1st Qu. Median Mean 3rd Qu. Max. 19.58 21.05 21.24 21.24 21.44 22.36 > hist(medPB, nclass=40, prob=T, xlab="Galaxy speed/1000", main= "Parametric Bootstrap Medians") a = mean(medPB); b = sd(medPB); xvals = seq(19.5,22.5, l=200) lines(xvals, dnorm(xvals,a,b), col="red") ### CI based on normal distribution: CI0 = medtrinorm(par0,c(0,50)) + c(-1,1)*1.96*b CI0 [1] 20.66646 21.82788 CI1 = medtrinorm(par0,c(0,50)) - quantile(medPB-medtrinorm(par0,c(0,50)), prob=c(.975,.025)) CI1 97.5% 2.5% 20.67314 21.83846 #### Now for Nonparametric Bootstrap: there are two different possible targets !! > NPBgalx = array(galaxies[sample(1:82, 82*10000, replace=T)], c(1e4,82)) medNPB = apply(NPBgalx,1,median) summary(medNPB) Min. 1st Qu. Median Mean 3rd Qu. Max. 19.86 20.52 20.83 20.87 21.06 22.50 > CI2 = median(galaxies) - quantile(medNPB-median(galaxies), prob=c(.975,.025)) CI2 97.5% 2.5% 19.6555 21.4945 > medNPB2 = numeric(1e4) for(i in 1:1e4) { par2 = par.trans(nlm(function(par) -trimodnorm(NPBgalx[i,], par.trans(par))$logLik, c(5,2,3,3,3,3,0,0))$est) medNPB2[i] = medtrinorm(par2,c(0,50)) } ### The second target is the parametrically defined median calculated by way ### of a fitted 3-component mixture of normals > summary(medNPB2) Min. 1st Qu. Median Mean 3rd Qu. Max. 19.50 20.71 21.10 21.03 21.38 24.17 > CI3 = medtrinorm(par0,c(0,50)) - quantile(medNPB2-medtrinorm(par0,c(0,50)), prob=c(.975,.025)) CI3 97.5% 2.5% 20.69948 22.36541 > array(rbind(CI0, CI1, CI2, CI3), c(4,2), dimnames=list(c("Norm.Thy", "PB","NPB1","NPB2"), c(".025",".975"))) .025 .975 Norm.Thy 20.66646 21.82788 PB 20.67314 21.83846 NPB1 19.65550 21.49450 NPB2 20.69948 22.36541 #### HOW CAN WE DISTINGUISH CONCEPTUALLY OR IN TERMS OF PERFORMANCE BETWEEN THESE DIFFERENT INTERVALS ??