HW10 Solution
=============

datfram = InsectSprays
## The first 12 observations are with spray "A", the next 12 with spray "B", etc.

## For the negative binomial, let mu = r(1-p)/p and sigsq = r(1-p)/p^2 = mu/p.
## parm1 = (mu1, mu3, p1, p3, p4, p5, p6) is the form of the parameter in Anegllk.
## parm2 = (a1, a3, b1, b3, b4, b5, b6) is the form of the parameter in Bnegllk.

Anegllk = function(parm1, counts) {
   mu = parm1[c(1,1,2,2,2,1)]
   p  = parm1[c(3,3:7)]
   ### convert from mu and p < 1 to r
   parm = cbind(p = p, r = mu*p/(1-p))
   ctarr = array(counts, c(12,6))
   llk = 0
   for(i in 1:6) llk = llk + sum(log(dnbinom(ctarr[,i], parm[i,2], parm[i,1])))
   -llk }

trNorm = function(vec, mu, sig) {
   prbs = (pnorm((vec-mu+1)/sig) - pnorm((vec-mu)/sig))/pnorm(mu/sig)
   list(llk = sum(log(prbs)), probs = prbs) }

Bnegllk = function(parm2, counts) {
   parm = cbind(parm2[c(1,1,2,2,2,1)], parm2[c(3,3:7)])
   ctarr = array(counts, c(12,6))
   llk = 0
   for(i in 1:6) llk = llk + trNorm(ctarr[,i], parm[i,1], sqrt(parm[i,2]))$llk
   -llk }

#### Now try this out:
> Afit = optim( c(15,3,3/4,1/2,2/3,6/7,.45), Anegllk, counts=InsectSprays$count,
     method="L-BFGS-B", lower=c(2,1,rep(0,5)), upper=c(Inf,Inf,rep(0.99,5)),
     control=list(maxit=500))
> Afit
$par
[1] 15.3273637  3.8840934  0.7724956  0.2539672  0.8102891  0.9900000  0.4738192

$value
[1] 183.1616

$counts
function gradient 
      16       16 

$convergence
[1] 0

$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

NBparm = function(parm) {
   ### function to convert the (mu,p) parameters as in Afit to (r,p)
   mu = parm[c(1,1,2,2,2,1)]
   p  = parm[c(3,3:7)]
   ### convert from mu and p < 1 to r
   cbind(r = mu*p/(1-p), p = p) }

> NBparm(Afit$par)         #### apparent optimum: (r=52.04, p=0.7725) for A,B
              r         p  ####   (r=1.322, p=0.254)  for C
[1,]  52.044370 0.7724956  ####   (r=16.59, p=0.810)  for D
[2,]  52.044370 0.7724956  ####   (r=384.5, p=0.99)   for E, i.e. nearly N(3.884, 3.923)
[3,]   1.322237 0.2539672  ####   (r=13.80, p=0.474)  for F
[4,]  16.589656 0.8102891
[5,] 384.525246 0.9900000
[6,]  13.802097 0.4738192

> Bnegllk(c(15,3,4.5,2,2.5,2,6.2), InsectSprays$count)
[1] 245.5658

> Bfit = optim( c(15,4,4.5,2,2.5,2,6.2), Bnegllk, counts=InsectSprays$count,
     method="L-BFGS-B", lower=c(2,0.5,rep(1e-2,5)))
> Bfit
$par
[1] 15.734347  3.135267 18.801759  4.959303 15.464463  4.351583 39.023900

$value
[1] 185.0598

$counts
function gradient 
      61       61 

$convergence
[1] 0

$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

### We can see that the log-likelihood is a bit better (-183.2 versus -185.1)
###   for the negative binomial than for the truncated normal.

### We plot the two sets of pmf's: red points for the NegBin fit, blue line for truncNormal.

> x11(width=6, height=4); par(mfrow=c(2,3))
> xlims = cbind(c(0,30), c(0,15))
> ind1 = c(1,2,6,3:5);  ind2 = rep(1:2, rep(3,2));  ind3 = c(3,3,7,4:6)
> parmAB = rbind(parmA = Afit$par, parmB = Bfit$par)
> parmAB
          [,1]     [,2]       [,3]      [,4]       [,5]     [,6]       [,7]
parmA 15.32736 3.884093  0.7724956 0.2539672  0.8102891 0.990000  0.4738192
parmB 15.73435 3.135267 18.8017591 4.9593030 15.4644626 4.351583 39.0239003

> c(parmAB[1,1:2], parmAB[1,c(1,2,2,2,1)]/parmAB[1,3:7])
[1] 15.327364  3.884093 19.841360 15.293681  4.793466  3.923327 32.348551
### mean and variance parameters implied by parmA, to compare with parmB

> c(mean(datfram$count[c(1:24,61:72)]), mean(datfram$count[25:60]),
     var(datfram$count[1:24]), var(datfram$count[25:36]), var(datfram$count[37:48]),
     var(datfram$count[49:60]), var(datfram$count[61:72]))
[1] 15.500000  3.500000 19.557971  3.901515  6.265152  3.000000 38.606061
### mean and variance parameters from the original grouped data, to compare with parmB
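### A quick sanity check (a sketch, not needed for the solution): simulate from the
###   fitted group-C negative binomial and compare the sample moments with the stated
###   relations mu = r(1-p)/p and sigsq = mu/p:
rp = NBparm(Afit$par)[3,]                  ### fitted (r,p) for group C
sim = rnbinom(1e5, size=rp["r"], prob=rp["p"])
c(mean(sim), rp["r"]*(1-rp["p"])/rp["p"])  ### sample mean vs model mean mu
c(var(sim), mean(sim)/rp["p"])             ### sample variance vs model variance mu/p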
> tmp = NBparm(Afit$par)
> for(i in 1:6) {
     hist(datfram$count[(ind1[i]-1)*12+(1:12)], xlim=xlims[,ind2[i]], prob=T,
        xlab="count", ylab="prob.mass", main=paste("spray Group", LETTERS[ind1[i]]))
     points(0:xlims[2,ind2[i]], dnbinom(0:xlims[2,ind2[i]], tmp[ind1[i],1],
        tmp[ind1[i],2]), col="red")
     lines(0:xlims[2,ind2[i]], trNorm(0:xlims[2,ind2[i]], parmAB[2,ind2[i]],
        sqrt(parmAB[2,ind3[i]]))$probs, col="blue") }
### saved as "SprayPics.pdf"

### Provides a nice picture of the difference in fit.  Can we say which is better?
### Log-likelihood says negative binomial; next try a PARAMETRIC BOOTSTRAP (Monte Carlo).
### This completes part (A).

### Use the integer-valued distributions to simulate, with max possible value 49.
parr = array(0, c(50,6,2), dimnames=list(NULL, LETTERS[1:6], c("nbin","tNor")))
for(i in 1:3) {
   j = i+3
   parr[,i,1] = dnbinom(0:49, tmp[i,1], tmp[i,2])
   parr[,j,1] = dnbinom(0:49, tmp[j,1], tmp[j,2])
   parr[,ind1[i],2] = trNorm(0:49, parmAB[2,ind2[i]], sqrt(parmAB[2,ind3[i]]))$probs
   parr[,ind1[j],2] = trNorm(0:49, parmAB[2,ind2[j]], sqrt(parmAB[2,ind3[j]]))$probs }

> parr[1,,2]
           A            B            C            D            E            F 
0.0001969165 0.0001969165 0.0969540788 0.1027812076 0.0927507751 0.0033011615 
### This shows that zero counts are not at all unlikely in simulated groups C,D,E !

### Now for the simulations: 10,000 datasets from each model.
DatNBin = DatTNor = array(0, c(1e4,72))
set.seed(7779)
for(k in 1:6) {
   j = (k-1)*12
   for(i in 1:1e4) {
      DatNBin[i,j+(1:12)] = sample(0:49, 12, rep=T, prob=parr[,k,1])
      DatTNor[i,j+(1:12)] = sample(0:49, 12, rep=T, prob=parr[,k,2]) } }

> apply(parr, 2:3, sum)
       nbin tNor
A 1.0000000    1
B 1.0000000    1
C 0.9999989    1
D 1.0000000    1
E 1.0000000    1
F 0.9999916    1

> summary(apply(DatNBin[,c(1:24,61:72)], 1, max))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  18.00   25.00   27.00   27.18   29.00   49.00 
> summary(apply(DatTNor[,c(1:24,61:72)], 1, max))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  19.00   24.00   26.00   26.33   28.00   44.00 
> summary(apply(DatTNor[,25:60], 1, max))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  5.000   8.000   9.000   9.643  11.000  20.000 
> summary(apply(DatNBin[,25:60], 1, max))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    5.0     9.0    11.0    12.4    14.0    42.0 
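### A faster way to draw the NegBin datasets (a sketch): since the fitted NegBin pmf's
###   put essentially all of their mass below 50, we can skip the sample(0:49,...) loop
###   over datasets and draw whole blocks with rnbinom.  (Unlike the construction above,
###   this imposes no cap at 49, so the two schemes differ slightly in the extreme tail.)
rp = NBparm(Afit$par)
DatNBin2 = matrix(0, 1e4, 72)
for(k in 1:6) DatNBin2[, (k-1)*12 + 1:12] = rnbinom(12e4, rp[k,1], rp[k,2])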
### OK, now fit both models to the simulated datasets to obtain reference distributions
###   for the MLEs.  Must decide which parameters to calculate and use to generate the
###   reference distributions.  Let's calculate the 7 parameter estimators, and separately
###   assess the expected range (max - min) count for each group.

> sprvec = rep(1:6, rep(12,6))
> sum(abs(sprvec - as.numeric(datfram$spray)))   ### = 0, so sprvec matches the spray coding
> RngNB = t(apply(DatNBin, 1, function(row) apply(array(row, c(12,6)), 2,
       function(col) max(col) - min(col))))
> summary(RngNB)
       V1              V2              V3              V4              V5              V6       
 Min.   : 5.00   Min.   : 4.00   Min.   : 2.00   Min.   : 2.00   Min.   : 2.00   Min.   : 6.00  
 1st Qu.:12.00   1st Qu.:12.00   1st Qu.: 9.00   1st Qu.: 6.00   1st Qu.: 5.00   1st Qu.:15.00  
 Median :14.00   Median :14.00   Median :11.00   Median : 7.00   Median : 6.00   Median :18.00  
 Mean   :14.42   Mean   :14.44   Mean   :11.81   Mean   : 6.95   Mean   : 6.32   Mean   :18.36  
 3rd Qu.:17.00   3rd Qu.:17.00   3rd Qu.:14.00   3rd Qu.: 8.00   3rd Qu.: 7.00   3rd Qu.:21.00  
 Max.   :32.00   Max.   :31.00   Max.   :42.00   Max.   :17.00   Max.   :14.00   Max.   :45.00  

> RngTN = t(apply(DatTNor, 1, function(row) apply(array(row, c(12,6)), 2,
       function(col) max(col) - min(col))))
> summary(RngTN)
       V1              V2              V3               V4               V5               V6       
 Min.   : 4.00   Min.   : 4.00   Min.   : 2.000   Min.   : 3.000   Min.   : 1.000   Min.   : 7.00  
 1st Qu.:12.00   1st Qu.:12.00   1st Qu.: 5.000   1st Qu.: 8.000   1st Qu.: 5.000   1st Qu.:17.00  
 Median :14.00   Median :14.00   Median : 6.000   Median : 9.000   Median : 6.000   Median :20.00  
 Mean   :14.09   Mean   :14.14   Mean   : 6.015   Mean   : 9.172   Mean   : 5.768   Mean   :19.77  
 3rd Qu.:16.00   3rd Qu.:16.00   3rd Qu.: 7.000   3rd Qu.:11.000   3rd Qu.: 7.000   3rd Qu.:23.00  
 Max.   :31.00   Max.   :31.00   Max.   :12.000   Max.   :20.000   Max.   :13.000   Max.   :38.00  

> GpRang = aggregate(datfram$count, by=list(LETTERS[sprvec]),
       function(col) max(col) - min(col))$x
> GpRang
[1] 16 14  7 10  5 17
### We can see from this that with data generated from NBin, all of the actual
###   group ranges fall in the middle of the simulated distribution of ranges.

###-----------------------------------------------------------------------
### So to complete part (B), we consider a statistic that looks at the largest
###   observations in a way that depends on the group maxima scaled by the group means:
###       S = max_g ( max_{i in g} X_i / muhat_g )
###   where muhat_g is the MLE of the group mean parameter mu_g.
###-----------------------------------------------------------------------

### Start with arrays of MLEs based on NBin and truncNormal fits to the simulated data.
### The third array dimension is 1 for the NegBin-model fits and 2 for the
###   truncNormal-model fits (both fitted to the NBin-simulated datasets).

ModfitA = array(0, c(1e4, 7, 2))
gpnums = rep(1:6, rep(12,6))    ### these are the group labels, as numbers
> for (i in 1:10000) {
     tmp = optim( Afit$par, Anegllk, counts=DatNBin[i,], method="L-BFGS-B",
        lower=c(1,1,rep(1e-2,5)), upper=c(1e3,1e3,rep(0.99,5)), control=list(maxit=500))
     if(tmp$conv==0) ModfitA[i,,1] = tmp$par }

> summary(ModfitA[,1,1])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  12.61   14.78   15.33   15.34   15.87   18.79     ### no 0's, so all converged !!
> summary(ModfitA[,2,1])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.275   3.611   3.893   3.899   4.170   5.650 
### These were the mu parameters in the NegBin case; histograms of the MLEs of the p's
###   show large fractions, in all groups other than 4, for which p is close to 1 !!

### Next make parmB a vector of mu and sigsq parameters as in model B, computed from Afit$par:
> parmB = c(Afit$par[1:2], Afit$par[c(1,2,2,2,1)]/Afit$par[3:7])
> parmB
[1] 15.327364  3.884093 19.841360 15.293681  4.793466  3.923327 32.348551
### Note that these mu, sigsq parameters differ markedly in groups 3,4 from those in model B !!
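### A sketch of the histogram check mentioned above: group E's p sits in position 6 of
###   the 7-parameter vector (layout mu1, mu3, p1, p3, p4, p5, p6), and its MLE was at
###   the 0.99 bound in the original fit, so look at how often the simulated fits pile
###   up near that bound:
hist(ModfitA[,6,1], nclass=40, prob=T, xlab="p-hat for group E",
     main="NegBin MLEs of p, group E, over 1e4 simulated datasets")
mean(ModfitA[,6,1] > 0.985)    ### fraction of fits at or near the 0.99 boundary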
> Bfit$par
[1] 15.734347  3.135267 18.801759  4.959303 15.464463  4.351583 39.023900

### First try fitting model B to the simulated NBin data with L-BFGS-B, with starting
###   values built from the groupwise sample means and variances:
for (i in 1:100) {
   aux = aggregate(DatNBin[i,], by=list(gpnums), function(col) c(mean(col), var(col)))$x
   tmp = optim( c(mean(aux[c(1,2,6),1]), mean(aux[3:5,1]), mean(aux[1:2,2]), aux[3:6,2]),
      Bnegllk, counts=DatNBin[i,], method="L-BFGS-B", lower=c(2,0.5,rep(1e-2,5)))
   if(tmp$conv==0) ModfitA[i,,2] = tmp$par }

### Reparameterize with log-variances, so that the maximization is unconstrained
###   and plain BFGS can be used:
> Bnegllk2 = function(parm2, counts) {
     parm = cbind(parm2[c(1,1,2,2,2,1)], exp(0.5*parm2[c(3,3:7)]))
     ctarr = array(counts, c(12,6))
     llk = 0
     for(i in 1:6) llk = llk + trNorm(ctarr[,i], parm[i,1], parm[i,2])$llk
     -llk }

> parmB2 = c(parmB[1:2], log(parmB[3:7]))
> for (i in c(1:5000, 6001:7000, 9001:1e4)) {
     tmp = optim( parmB2, Bnegllk2, counts=DatNBin[i,], method="BFGS")
     if(tmp$conv==0) ModfitA[i,,2] = tmp$par }
#### All of these converged, including those in the ranges 6001:7000 and 9001:1e4, but
####   "optim" failed on the simulated rows in the ranges 5001:6000 and 7001:9000.

> for (i in c(5001:6e3, 7001:9e3)) {
     parmB2tmp = c(mean(DatNBin[i,c(1:24,61:72)]), mean(DatNBin[i,25:60]),
        log(c(var(DatNBin[i,1:24]), var(DatNBin[i,25:36]), var(DatNBin[i,37:48]),
              var(DatNBin[i,49:60]), var(DatNBin[i,61:72]))))
     tmp = optim( parmB2tmp, Bnegllk2, counts=DatNBin[i,], method="BFGS")
     if(tmp$conv==0) ModfitA[i,,2] = tmp$par }
### Now all converged.

##### Now complete the reference-distribution analysis, using the NegBin MLE fits to
#####   construct the reference distribution for the groupwise max/mean statistic.

### First, in the original dataset:
> apply(array(datfram$count, c(12,6)), 2, max)/Afit$par[c(1,1,2,2,2,1)]
[1] 1.500584 1.370099 1.802222 3.089524 1.544762 1.696313
### so the overall max of these is S = 3.0895

### Now create the reference distribution, using ModfitA[,1:2,1] to provide the (A,B,F)
###   and (C,D,E) group-mean MLEs from the NegBin fits:
> tmparr = apply(array(DatNBin, c(1e4,12,6)), c(1,3), max)/ModfitA[,c(1,1,2,2,2,1),1]
### this is a 10000 x 6 array containing the groupwise ratios of max over MLE mean
> Sstat = apply(tmparr, 1, max)
> hist(Sstat, nclass=50, prob=T, xlab="Sstat", ylab="Density",
     main="Reference Hist for 1e4 simulated Sstat values")
> abline(v=3.0895, col="red")
> mean(Sstat > 3.0895)
[1] 0.4346
### So the original statistic value is not unduly large compared to the
###   reference distribution !!

###>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
### As noted above, there are many zero counts among the C,D,E groups in DatTNor:
> parr[1,,2]
           A            B            C            D            E            F 
0.0001969165 0.0001969165 0.0969540788 0.1027812076 0.0927507751 0.0033011615 
### This shows that zero counts are not unlikely in simulated groups C,D,E.
> sum(DatTNor[,25:60]==0)    #### 0's in simulated groups C,D,E: a little less than 10%
[1] 35296
> sum(DatTNor[,c(1:24,61:72)]==0)    ### in the other groups, few 0's
[1] 455

### So we do not repeat the "reference distribution" analysis on the DatTNor simulated
###   datasets, because it is OBVIOUS that the DatTNor datasets are very UNLIKE the
###   original dataset in the number of 0's they contain in groups C,D,E !!!

### OVERALL CONCLUSION: DESPITE THE VERY CLOSE MAXIMIZED LOG-LIKELIHOOD VALUES FOR THE
###   TWO MODELS, THE TRUNCATED NORMAL MODEL IS NOT REASONABLE, WHILE THE NEGATIVE
###   BINOMIAL FIT IS NOT BAD, DESPITE THE PROBLEM THAT THE VARIANCE IN GROUP E MIGHT
###   ACTUALLY BE SMALLER THAN THE MEAN, WHICH IS IMPOSSIBLE FOR THE NEGATIVE BINOMIAL.
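### A sketch quantifying the zero-count comparison behind this conclusion: the
###   per-dataset counts of zeros in groups C,D,E under each fitted model, to set
###   beside the number of zeros in the real C,D,E data.
sum(datfram$count[25:60] == 0)            ### zeros among the real C,D,E counts
summary(rowSums(DatTNor[,25:60] == 0))    ### zeros per simulated truncNormal dataset
summary(rowSums(DatNBin[,25:60] == 0))    ### zeros per simulated NegBin dataset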