R Script for Power Calculations, Lec.15 ======================================== E. Slud 3/11/21 Computations with Power calculation, and computation of p-value, using Parametric and Nonparametric Bootstrap, and verification with Monte Carlo We carry over some of the coded functions from RscriptLec13, and modify some parts of them. TWilc = function(wvec,yvec) { m = length(wvec) n = length(yvec) T = (mean(yvec)-mean(wvec))/sqrt(var(wvec)/m+var(yvec)/n) Wilc = mean(outer(wvec, yvec, function(w,y) w= val)) Stat.TWilc = TWilc(wvec,yvec) c(pCalc(bootout[,1],Stat.TWilc[1]), pCalc(bootout[,2],Stat.TWilc[2])) [1] 0 0 ## The p-values depend importantly on how many Bootstrap replications # were done. Here the p-values of 0 mean that in 1000 Bootstraps none of # the reclaculated statistics was as large as the entries of Stat.TWilc apply(bootout,2,max) T Wilc 3.084511 0.673750 ## so the estimated p-values were < .001 #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # Now do a Monte Carlo versus Bootstrap calculation of power #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # The Monte Carlo estimate is `correct' to within simulation error, # estimating from Rep=1000 distinct datasets of size 80, # while the Bootstrap uses only one `real' sample and # repeatedly re-samples it. PowCalc = function(wvec, yvec, B=1000, Delta=c(0,1)) { ### first entry of Delta must be 0 m = length(wvec) n = length(yvec) wbar = mean(wvec) ybar = mean(yvec) yinds = m+(1:n) d = length(Delta) bootarr = cbind(array(sample(wvec, B*m, replace=T), c(B,m)), array(sample(yvec, B*n, replace=T), c(B,n))) outstats = array(0, c(B,d,2)) for(i in 1:d) for(b in 1:B) outstats[b,i,] = TWilc(bootarr[b,1:m]-wbar, bootarr[b,yinds]-ybar+Delta[i]) cmat = apply(outstats[,1,],2,quantile, prob=c(.025,.975)) apply( array(outstats - rep(cmat[1,],rep(B*d,2)) < 0,c(B,d,2)) | array(outstats - rep(cmat[2,],rep(B*d,2)) > 0,c(B,d,2)), 2:3, mean) } > PowCalc(wvec,yvec,Delta=(0:5)/5) T M-W-Wilc [1,] 0.050 0.049 ## Rows correspond to Delta = 0, 0.2, 0.4,...,1 [2,] 0.085 0.109 [3,] 0.210 0.283 [4,] 0.398 0.489 [5,] 0.612 0.692 [6,] 0.814 0.865 ## Powers SHOULD be larger for Wilcoxon because it is ## optimal (as good as the best parametric test-statistic) # for logistic data. ### Let's confirm these power numbers via Monte Carlo, generating many (1e5) ### 2-sample datasets and tallying relative frequency of rejection. set.seed(1101) wmat = array(rlogis(1e5*40), c(1e5,40)) ymat = array(rlogis(1e5*40), c(1e5,40)) TW = array(0,c(1e5,2)) for(i in 1:1e5) TW[i,] = TWilc(wmat[i,],ymat[i,]) ctru =apply(TW,2,quantile,prob=c(.025,.975)) [,1] [,2] 2.5% -2.000969 0.372500 97.5% 1.988591 0.626875 ## by symmetry, resp about 0 and 1/2 # take true cutoffs to be (-1.9948,1.9948) for t # and (.3728,.6272) for Wilc for(d in 1:5) { for(i in 1:1e5) TW[i,] = TWilc(wmat[i,],d*0.2+ymat[i,]) cat(c(mean(abs(TW[,1])> 1.9948), mean(abs(TW[,2]-0.5)>0.1272)),"\n") } 0.07652 0.07986 0.16362 0.17408 0.31008 0.33175 0.49546 0.52599 0.68241 0.71479 # Not good agreement with the powers in rows 2:6. Why not ? ## Repeat the "true power" calculation again (with separate random numbers) # and get essentially the samre result for(d in 1:5) { wmat = array(rlogis(1e5*40),c(1e5,40)) ymat = array(rlogis(1e5*40,0.2*d),c(1e5,40)) for(i in 1:1e5) TW[i,] = TWilc(wmat[i,],ymat[i,]) cat(c(mean(abs(TW[,1])> 1.9948), mean(abs(TW[,2]-0.5)>0.1272)),"\n") } 0.07635 0.07922 0.16596 0.17478 0.3124 0.33094 0.49765 0.52811 0.68434 0.7162 #################################################### To repair this: I conjectured that I should replace the centered sampled (bootstrapped) batches by centered and scaled samples PermBoot2 = function(wvec, yvec, Delta=0, Rep=1000, keep.Perm=T) { ## function for permutational and bootstrap quantiles ## recode to accommodate shifting by Delta to get power estimates m = length(wvec) n = length(yvec) xvec = c(wvec,yvec) avg.w = mean(wvec) avg.y = mean(yvec) sd.w = sd(wvec) sd.y = sd(yvec) winds = 1:m yinds = m+(1:n) if(keep.Perm) unifarr = array(runif((m+n)*Rep), c(Rep,m+n)) bootarr = cbind(array(sample(wvec,m*Rep,replace=T), c(Rep,m)), array(sample(yvec,n*Rep,replace=T), c(Rep,n))) # Rep x (m+n) bootout = array(0, c(Rep,2), dimnames=list(NULL,c("T","Wilc"))) if(keep.Perm) permout = bootout for(i in 1:Rep) { if(keep.Perm) { permsamp = xvec[order(unifarr[i,])] permout[i,] = TWilc(permsamp[winds],permsamp[yinds]) } bootout[i,] = TWilc((bootarr[i,winds]-avg.w)*sd.w*(1-1/m)/ sd(bootarr[i,winds]), (bootarr[i,yinds]-avg.y+Delta)*sd.y*(1-1/n)/ sd(bootarr[i,yinds])) } list(bootout=bootout, permout= if(keep.Perm) permout else NULL) } PowCalc2 = function(wvec, yvec, B=1000, Delta=c(0,1)) { ### first entry of Delta must be 0 m = length(wvec) n = length(yvec) wbar = mean(wvec) ybar = mean(yvec) sd.w = sd(wvec) sd.y = sd(yvec) yinds = m+(1:n) d = length(Delta) bootarr = cbind(array(sample(wvec, B*m, replace=T), c(B,m)), array(sample(yvec, B*n, replace=T), c(B,n))) outstats = array(0, c(B,d,2)) outPow = array(0, c(d,2)) for(i in 1:d) for(b in 1:B) outstats[b,i,] = TWilc((bootarr[b,1:m]-wbar)* ((1-1/m)*sd.w)/sd(bootarr[b,1:m]), (bootarr[b,yinds]-ybar+Delta[i])*((1-1/n)*sd.y)/ sd(bootarr[b,yinds])) cmat = apply(outstats[,1,],2,quantile, prob=c(.025,.975)) apply( array(outstats - rep(cmat[1,],rep(B*d,2)) < 0,c(B,d,2)) | array(outstats - rep(cmat[2,],rep(B*d,2)) > 0,c(B,d,2)), 2:3, mean) } set.seed(1097) wvec = rlogis(40); yvec = rlogis(40,1) bootout = PermBoot2(wvec,yvec, keep.Perm=F)$bootout round(rbind(apply(bootout, 2, quantile, prob=c(.025,.975)), TWilc(wvec,yvec)),4) T Wilc 2.5% -2.0066 0.3449 97.5% 2.0486 0.5963 3.4226 0.7031 > PowCalc2(wvec,yvec,Delta=(0:5)/5) [,1] [,2] [1,] 0.050 0.050 [2,] 0.100 0.099 [3,] 0.218 0.274 [4,] 0.420 0.471 [5,] 0.616 0.663 [6,] 0.815 0.865 ###------------------------------------------------------------- # Try the same simulation again with a different sample: set.seed(1106) wvec = rlogis(40); yvec = rlogis(40,1) PowCalc(wvec,yvec,Delta=(0:5)/5) [,1] [,2] [1,] 0.050 0.049 [2,] 0.082 0.075 [3,] 0.154 0.146 [4,] 0.282 0.289 [5,] 0.460 0.497 [6,] 0.660 0.685 > PowCalc2(wvec,yvec,Delta=(0:5)/5) [,1] [,2] [1,] 0.050 0.049 [2,] 0.089 0.092 [3,] 0.173 0.176 [4,] 0.306 0.301 [5,] 0.480 0.485 [6,] 0.645 0.657 ## This one is much more in line with the little power table calculated by Monte Carlo above, 0.07635 0.07922 0.16596 0.17478 0.3124 0.33094 0.49765 0.52811 0.68434 0.7162 ### Let's try the corresponding simulation with larger samples of 70 and 100 set.seed(1111) wvec = rlogis(70); yvec = rlogis(100,1) > PowCalc(wvec,yvec,Delta=(0:5)/5) [,1] [,2] [1,] 0.050 0.050 [2,] 0.099 0.125 [3,] 0.258 0.319 [4,] 0.525 0.573 [5,] 0.782 0.842 [6,] 0.926 0.942 > PowCalc2(wvec,yvec,Delta=(0:5)/5) [,1] [,2] [1,] 0.050 0.050 [2,] 0.099 0.097 [3,] 0.242 0.258 [4,] 0.491 0.515 [5,] 0.760 0.784 [6,] 0.914 0.924 ## Here the Monte Carlo power table is as follows: set.seed(1113) wmat = array(rlogis(1e5*70), c(1e5,70)) ymat = array(rlogis(1e5*100), c(1e5,100)) TW = array(0,c(1e5,2)) for(i in 1:1e5) TW[i,] = TWilc(wmat[i,],ymat[i,]) ctru =apply(TW,2,quantile,prob=c(.025,.975)) [,1] [,2] 2.5% -1.976045 0.4118571 97.5% 1.963361 0.5881429 for(d in 1:5) { wmat = array(rlogis(1e5*70),c(1e5,70)) ymat = array(rlogis(1e5*100,0.2*d),c(1e5,100)) for(i in 1:1e5) TW[i,] = TWilc(wmat[i,],ymat[i,]) cat(c(mean(abs(TW[,1])> 1.970), mean(abs(TW[,2]-0.5)>0.0881)),"\n") } 0.11094 0.11576 0.29415 0.31499 0.56711 0.60202 0.80601 0.83774 0.94082 0.95723 ##==================================================================== # ONE MORE THING WE CAN TRY IS TO SEE WHETHER THE AVERAGE OF MANY (300) # MONTE-CARLO REPETITIONS OF THE BOOTSTRAP POWERS ARE ACCURATE. #(WE CAN ALSO TAKE STANDARD DEVIATIONS OF THE CALCULATED POWERS TO # UNDERSTAND JUST HOW VARIABLE THEY ARE.) PowArr = array(0, c(300,5,2), dimnames=list(NULL, paste0("Delta=",(1:5)/5),c("T","Wilc"))) set.seed(1117) for(j in 1:300) { wvec = rlogis(40); yvec = rlogis(40,1) PowArr[j,,] = PowCalc(wvec,yvec,Delta=(0:5)/5)[2:6,] } ## For this resuting array, the mean powers are: > apply(PowArr,2:3,mean) T Wilc Delta=0.2 0.0828 0.0825 Delta=0.4 0.1793 0.1810 Delta=0.6 0.3370 0.3374 Delta=0.8 0.5292 0.5308 Delta=1 0.7096 0.7098 ## and this should be compared to our previous # Monte Carlo calculation of powers: 0.07635 0.07922 0.16596 0.17478 0.3124 0.33094 0.49765 0.52811 0.68434 0.7162 ### The agreement here looks quite good! # and to confirm that the calculated bootstrap powers were in # fact somewhat noisy, here are the standard deviations # across the 300 Monte Carlo iterations: > round( apply(PowArr,2:3,sd), 4) T Wilc Delta=0.2 0.0123 0.0141 Delta=0.4 0.0361 0.0422 Delta=0.6 0.0675 0.0745 Delta=0.8 0.0934 0.1024 Delta=1 0.0964 0.1020 ## At this point, our agreement between Bootstrap and MC ## calculated powers is good enough. But the variability ## of the bootstrap powers might tell us that we can use ## those estimated powers as only a ballpark guess at the ## actual power values. Still, they might be useful ...