HW10 Solution
=============

datfram = InsectSprays
## The first 12 observations are with spray "A", the next 12 with spray "B", etc.

## For the negative binomial, let mu = r(1-p)/p and sigsq = r(1-p)/p^2 = mu/p.
## parm1 = (mu1, mu3, p1, p3, p4, p5, p6) is the form of the parameter in Anegllk.
## parm2 = (a1, a3, b1, b3, b4, b5, b6) is the form of the parameter in Bnegllk.

Anegllk = function(parm1, counts) {
   mu = parm1[c(1,1,2,2,2,1)]
   p  = parm1[c(3,3:7)]
   ### convert from mu and p < 1 to r
   parm = cbind(p = p, r = mu*p/(1-p))
   ctarr = array(counts, c(12,6))
   llk = 0
   for(i in 1:6) llk = llk + sum(log(dnbinom(ctarr[,i], parm[i,2], parm[i,1])))
   -llk }

trNorm = function(vec, mu, sig) {
   prbs = (pnorm((vec-mu+1)/sig) - pnorm((vec-mu)/sig))/pnorm(mu/sig)
   list(llk = sum(log(prbs)), probs = prbs) }

Bnegllk = function(parm2, counts) {
   parm = cbind(parm2[c(1,1,2,2,2,1)], parm2[c(3,3:7)])
   ctarr = array(counts, c(12,6))
   llk = 0
   for(i in 1:6) llk = llk + trNorm(ctarr[,i], parm[i,1], sqrt(parm[i,2]))$llk
   -llk }

#### Now try this out:
> Afit = optim( c(15,3,3/4,1/2,2/3,6/7,.45), Anegllk, counts=InsectSprays$count,
     method="L-BFGS-B", lower=c(2,1,rep(0,5)), upper=c(Inf,Inf,rep(0.99,5)),
     control=list(maxit=500))
> Afit
$par
[1] 15.3273637  3.8840934  0.7724956  0.2539672  0.8102891  0.9900000  0.4738192

$value
[1] 183.1616

$counts
function gradient 
      16       16 

$convergence
[1] 0

$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

NBparm = function(parm) {
   ### function to convert the (mu,p) parameters as in Afit to (r,p)
   mu = parm[c(1,1,2,2,2,1)]
   p  = parm[c(3,3:7)]
   ### convert from mu and p < 1 to r
   cbind(r = mu*p/(1-p), p = p) }

> NBparm(Afit$par)         #### apparent optimum: (r=52.04, p=0.7725) for A,B
              r         p  ####   (r=1.322, p=0.254)  for C
[1,]  52.044370 0.7724956  ####   (r=16.59, p=0.810)  for D
[2,]  52.044370 0.7724956  ####   (r=384.5, p=0.99)   for E, i.e. nearly N(3.884, 3.923)
[3,]   1.322237 0.2539672  ####   (r=13.80, p=0.474)  for F
[4,]  16.589656 0.8102891
[5,] 384.525246 0.9900000
[6,]  13.802097 0.4738192

> Bnegllk(c(15,3,4.5,2,2.5,2,6.2), InsectSprays$count)
[1] 245.5658

> Bfit = optim( c(15,4,4.5,2,2.5,2,6.2), Bnegllk, counts=InsectSprays$count,
     method="L-BFGS-B", lower=c(2,0.5,rep(1e-2,5)))
> Bfit
$par
[1] 15.734347  3.135267 18.801759  4.959303 15.464463  4.351583 39.023900

$value
[1] 185.0598

$counts
function gradient 
      61       61 

$convergence
[1] 0

$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

### We can see that the log-likelihood is a bit better (-183.2 versus -185.1)
###   for the negative binomial than for the truncated normal.

### We plot the two sets of pmf's: red points for the NegBin fit, blue line for truncNormal.

> x11(width=6, height=4); par(mfrow=c(2,3))
> xlims = cbind(c(0,30), c(0,15))
> ind1 = c(1,2,6,3:5);  ind2 = rep(1:2, rep(3,2));  ind3 = c(3,3,7,4:6)
> parmAB = rbind(parmA = Afit$par, parmB = Bfit$par)
> parmAB
          [,1]     [,2]       [,3]      [,4]       [,5]     [,6]       [,7]
parmA 15.32736 3.884093  0.7724956 0.2539672  0.8102891 0.990000  0.4738192
parmB 15.73435 3.135267 18.8017591 4.9593030 15.4644626 4.351583 39.0239003

> c(parmAB[1,1:2], parmAB[1,c(1,2,2,2,1)]/parmAB[1,3:7])
[1] 15.327364  3.884093 19.841360 15.293681  4.793466  3.923327 32.348551
### mean and variance parameters implied by parmA, to compare with parmB

> c(mean(datfram$count[c(1:24,61:72)]), mean(datfram$count[25:60]),
     var(datfram$count[1:24]), var(datfram$count[25:36]), var(datfram$count[37:48]),
     var(datfram$count[49:60]), var(datfram$count[61:72]))
[1] 15.500000  3.500000 19.557971  3.901515  6.265152  3.000000 38.606061
### mean and variance parameters from the original grouped data, to compare with parmB
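### A quick sanity check (a sketch, not needed for the solution): simulate from the
###   fitted group-C negative binomial and compare the sample moments with the stated
###   relations mu = r(1-p)/p and sigsq = mu/p:
rp = NBparm(Afit$par)[3,]                  ### fitted (r,p) for group C
sim = rnbinom(1e5, size=rp["r"], prob=rp["p"])
c(mean(sim), rp["r"]*(1-rp["p"])/rp["p"])  ### sample mean vs model mean mu
c(var(sim), mean(sim)/rp["p"])             ### sample variance vs model variance mu/p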
> tmp = NBparm(Afit$par)
> for(i in 1:6) {
     hist(datfram$count[(ind1[i]-1)*12+(1:12)], xlim=xlims[,ind2[i]], prob=T,
        xlab="count", ylab="prob.mass", main=paste("spray Group", LETTERS[ind1[i]]))
     points(0:xlims[2,ind2[i]], dnbinom(0:xlims[2,ind2[i]], tmp[ind1[i],1],
        tmp[ind1[i],2]), col="red")
     lines(0:xlims[2,ind2[i]], trNorm(0:xlims[2,ind2[i]], parmAB[2,ind2[i]],
        sqrt(parmAB[2,ind3[i]]))$probs, col="blue") }
### saved as "SprayPics.pdf"

### Provides a nice picture of the difference in fit.  Can we say which is better?
### Log-likelihood says negative binomial; next try a PARAMETRIC BOOTSTRAP (Monte Carlo).
### This completes part (A).

### Use the integer-valued distributions to simulate, with max possible value 49.
parr = array(0, c(50,6,2), dimnames=list(NULL, LETTERS[1:6], c("nbin","tNor")))
for(i in 1:3) {
   j = i+3
   parr[,i,1] = dnbinom(0:49, tmp[i,1], tmp[i,2])
   parr[,j,1] = dnbinom(0:49, tmp[j,1], tmp[j,2])
   parr[,ind1[i],2] = trNorm(0:49, parmAB[2,ind2[i]], sqrt(parmAB[2,ind3[i]]))$probs
   parr[,ind1[j],2] = trNorm(0:49, parmAB[2,ind2[j]], sqrt(parmAB[2,ind3[j]]))$probs }

> parr[1,,2]
           A            B            C            D            E            F 
0.0001969165 0.0001969165 0.0969540788 0.1027812076 0.0927507751 0.0033011615 
### This shows that zero counts are not at all unlikely in simulated groups C,D,E !

### Now for the simulations: 10,000 datasets from each model.
DatNBin = DatTNor = array(0, c(1e4,72))
set.seed(7779)
for(k in 1:6) {
   j = (k-1)*12
   for(i in 1:1e4) {
      DatNBin[i,j+(1:12)] = sample(0:49, 12, rep=T, prob=parr[,k,1])
      DatTNor[i,j+(1:12)] = sample(0:49, 12, rep=T, prob=parr[,k,2]) } }

> apply(parr, 2:3, sum)
       nbin tNor
A 1.0000000    1
B 1.0000000    1
C 0.9999989    1
D 1.0000000    1
E 1.0000000    1
F 0.9999916    1

> summary(apply(DatNBin[,c(1:24,61:72)], 1, max))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  18.00   25.00   27.00   27.18   29.00   49.00 
> summary(apply(DatTNor[,c(1:24,61:72)], 1, max))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  19.00   24.00   26.00   26.33   28.00   44.00 
> summary(apply(DatTNor[,25:60], 1, max))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  5.000   8.000   9.000   9.643  11.000  20.000 
> summary(apply(DatNBin[,25:60], 1, max))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    5.0     9.0    11.0    12.4    14.0    42.0 
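### A faster way to draw the NegBin datasets (a sketch): since the fitted NegBin pmf's
###   put essentially all of their mass below 50, we can skip the sample(0:49,...) loop
###   over datasets and draw whole blocks with rnbinom.  (Unlike the construction above,
###   this imposes no cap at 49, so the two schemes differ slightly in the extreme tail.)
rp = NBparm(Afit$par)
DatNBin2 = matrix(0, 1e4, 72)
for(k in 1:6) DatNBin2[, (k-1)*12 + 1:12] = rnbinom(12e4, rp[k,1], rp[k,2])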
### OK, now fit both models to the simulated datasets to obtain reference distributions
###   for the MLEs.  Must decide which parameters to calculate and use to generate the
###   reference distributions.  Let's calculate the 7 parameter estimators, and separately
###   assess the expected range (max - min) count for each group.

> sprvec = rep(1:6, rep(12,6))
> sum(abs(sprvec - as.numeric(datfram$spray)))   ### = 0, so sprvec matches the spray coding
> RngNB = t(apply(DatNBin, 1, function(row) apply(array(row, c(12,6)), 2,
       function(col) max(col) - min(col))))
> summary(RngNB)
       V1              V2              V3              V4              V5              V6       
 Min.   : 5.00   Min.   : 4.00   Min.   : 2.00   Min.   : 2.00   Min.   : 2.00   Min.   : 6.00  
 1st Qu.:12.00   1st Qu.:12.00   1st Qu.: 9.00   1st Qu.: 6.00   1st Qu.: 5.00   1st Qu.:15.00  
 Median :14.00   Median :14.00   Median :11.00   Median : 7.00   Median : 6.00   Median :18.00  
 Mean   :14.42   Mean   :14.44   Mean   :11.81   Mean   : 6.95   Mean   : 6.32   Mean   :18.36  
 3rd Qu.:17.00   3rd Qu.:17.00   3rd Qu.:14.00   3rd Qu.: 8.00   3rd Qu.: 7.00   3rd Qu.:21.00  
 Max.   :32.00   Max.   :31.00   Max.   :42.00   Max.   :17.00   Max.   :14.00   Max.   :45.00  

> RngTN = t(apply(DatTNor, 1, function(row) apply(array(row, c(12,6)), 2,
       function(col) max(col) - min(col))))
> summary(RngTN)
       V1              V2              V3               V4               V5               V6       
 Min.   : 4.00   Min.   : 4.00   Min.   : 2.000   Min.   : 3.000   Min.   : 1.000   Min.   : 7.00  
 1st Qu.:12.00   1st Qu.:12.00   1st Qu.: 5.000   1st Qu.: 8.000   1st Qu.: 5.000   1st Qu.:17.00  
 Median :14.00   Median :14.00   Median : 6.000   Median : 9.000   Median : 6.000   Median :20.00  
 Mean   :14.09   Mean   :14.14   Mean   : 6.015   Mean   : 9.172   Mean   : 5.768   Mean   :19.77  
 3rd Qu.:16.00   3rd Qu.:16.00   3rd Qu.: 7.000   3rd Qu.:11.000   3rd Qu.: 7.000   3rd Qu.:23.00  
 Max.   :31.00   Max.   :31.00   Max.   :12.000   Max.   :20.000   Max.   :13.000   Max.   :38.00  

> GpRang = aggregate(datfram$count, by=list(LETTERS[sprvec]),
       function(col) max(col) - min(col))$x
> GpRang
[1] 16 14  7 10  5 17
### We can see from this that with data generated from NBin, all of the actual
###   group ranges fall in the middle of the simulated distribution of ranges.

###-----------------------------------------------------------------------
### So to complete part (B), we consider a statistic that looks at the largest
###   observations in a way that depends on the group maxima scaled by the group means:
###       S = max_g ( max_{i in g} X_i / muhat_g )
###   where muhat_g is the MLE of the group mean parameter mu_g.
###-----------------------------------------------------------------------

### Start with arrays of MLEs based on NBin and truncNormal fits to the simulated data.
### The third array dimension is 1 for the NegBin-model fits and 2 for the
###   truncNormal-model fits (both fitted to the NBin-simulated datasets).

ModfitA = array(0, c(1e4, 7, 2))
gpnums = rep(1:6, rep(12,6))    ### these are the group labels, as numbers
> for (i in 1:10000) {
     tmp = optim( Afit$par, Anegllk, counts=DatNBin[i,], method="L-BFGS-B",
        lower=c(1,1,rep(1e-2,5)), upper=c(1e3,1e3,rep(0.99,5)), control=list(maxit=500))
     if(tmp$conv==0) ModfitA[i,,1] = tmp$par }

> summary(ModfitA[,1,1])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  12.61   14.78   15.33   15.34   15.87   18.79     ### no 0's, so all converged !!
> summary(ModfitA[,2,1])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.275   3.611   3.893   3.899   4.170   5.650 
### These were the mu parameters in the NegBin case; histograms of the MLEs of the p's
###   show large fractions, in all groups other than 4, for which p is close to 1 !!

### Next make parmB a vector of mu and sigsq parameters as in model B, computed from Afit$par:
> parmB = c(Afit$par[1:2], Afit$par[c(1,2,2,2,1)]/Afit$par[3:7])
> parmB
[1] 15.327364  3.884093 19.841360 15.293681  4.793466  3.923327 32.348551
### Note that these mu, sigsq parameters differ markedly in groups 3,4 from those in model B !!
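### A sketch of the histogram check mentioned above: group E's p sits in position 6 of
###   the 7-parameter vector (layout mu1, mu3, p1, p3, p4, p5, p6), and its MLE was at
###   the 0.99 bound in the original fit, so look at how often the simulated fits pile
###   up near that bound:
hist(ModfitA[,6,1], nclass=40, prob=T, xlab="p-hat for group E",
     main="NegBin MLEs of p, group E, over 1e4 simulated datasets")
mean(ModfitA[,6,1] > 0.985)    ### fraction of fits at or near the 0.99 boundary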
> Bfit$par
[1] 15.734347  3.135267 18.801759  4.959303 15.464463  4.351583 39.023900

### First try fitting model B to the simulated NBin data with L-BFGS-B, with starting
###   values built from the groupwise sample means and variances:
for (i in 1:100) {
   aux = aggregate(DatNBin[i,], by=list(gpnums), function(col) c(mean(col), var(col)))$x
   tmp = optim( c(mean(aux[c(1,2,6),1]), mean(aux[3:5,1]), mean(aux[1:2,2]), aux[3:6,2]),
      Bnegllk, counts=DatNBin[i,], method="L-BFGS-B", lower=c(2,0.5,rep(1e-2,5)))
   if(tmp$conv==0) ModfitA[i,,2] = tmp$par }

### Reparameterize with log-variances, so that the maximization is unconstrained
###   and plain BFGS can be used:
> Bnegllk2 = function(parm2, counts) {
     parm = cbind(parm2[c(1,1,2,2,2,1)], exp(0.5*parm2[c(3,3:7)]))
     ctarr = array(counts, c(12,6))
     llk = 0
     for(i in 1:6) llk = llk + trNorm(ctarr[,i], parm[i,1], parm[i,2])$llk
     -llk }

> parmB2 = c(parmB[1:2], log(parmB[3:7]))
> for (i in c(1:5000, 6001:7000, 9001:1e4)) {
     tmp = optim( parmB2, Bnegllk2, counts=DatNBin[i,], method="BFGS")
     if(tmp$conv==0) ModfitA[i,,2] = tmp$par }
#### All of these converged, including those in the ranges 6001:7000 and 9001:1e4, but
####   "optim" failed on the simulated rows in the ranges 5001:6000 and 7001:9000.

> for (i in c(5001:6e3, 7001:9e3)) {
     parmB2tmp = c(mean(DatNBin[i,c(1:24,61:72)]), mean(DatNBin[i,25:60]),
        log(c(var(DatNBin[i,1:24]), var(DatNBin[i,25:36]), var(DatNBin[i,37:48]),
              var(DatNBin[i,49:60]), var(DatNBin[i,61:72]))))
     tmp = optim( parmB2tmp, Bnegllk2, counts=DatNBin[i,], method="BFGS")
     if(tmp$conv==0) ModfitA[i,,2] = tmp$par }
### Now all converged.

##### Now complete the reference-distribution analysis, using the NegBin MLE fits to
#####   construct the reference distribution for the groupwise max/mean statistic.

### First, in the original dataset:
> apply(array(datfram$count, c(12,6)), 2, max)/Afit$par[c(1,1,2,2,2,1)]
[1] 1.500584 1.370099 1.802222 3.089524 1.544762 1.696313
### so the overall max of these is S = 3.0895

### Now create the reference distribution, using ModfitA[,1:2,1] to provide the (A,B,F)
###   and (C,D,E) group-mean MLEs from the NegBin fits:
> tmparr = apply(array(DatNBin, c(1e4,12,6)), c(1,3), max)/ModfitA[,c(1,1,2,2,2,1),1]
### this is a 10000 x 6 array containing the groupwise ratios of max over MLE mean
> Sstat = apply(tmparr, 1, max)
> hist(Sstat, nclass=50, prob=T, xlab="Sstat", ylab="Density",
     main="Reference Hist for 1e4 simulated Sstat values")
> abline(v=3.0895, col="red")
> mean(Sstat > 3.0895)
[1] 0.4346
### So the original statistic value is not unduly large compared to the
###   reference distribution !!

###>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
### As noted above, there are many zero counts among the C,D,E groups in DatTNor:
> parr[1,,2]
           A            B            C            D            E            F 
0.0001969165 0.0001969165 0.0969540788 0.1027812076 0.0927507751 0.0033011615 
### This shows that zero counts are not unlikely in simulated groups C,D,E.
> sum(DatTNor[,25:60]==0)    #### 0's in simulated groups C,D,E: a little less than 10%
[1] 35296
> sum(DatTNor[,c(1:24,61:72)]==0)    ### in the other groups, few 0's
[1] 455

### So we do not repeat the "reference distribution" analysis on the DatTNor simulated
###   datasets, because it is OBVIOUS that the DatTNor datasets are very UNLIKE the
###   original dataset in the number of 0's they contain in groups C,D,E !!!

### OVERALL CONCLUSION: DESPITE THE VERY CLOSE MAXIMIZED LOG-LIKELIHOOD VALUES FOR THE
###   TWO MODELS, THE TRUNCATED NORMAL MODEL IS NOT REASONABLE, WHILE THE NEGATIVE
###   BINOMIAL FIT IS NOT BAD, DESPITE THE PROBLEM THAT THE VARIANCE IN GROUP E MIGHT
###   ACTUALLY BE SMALLER THAN THE MEAN, WHICH IS IMPOSSIBLE FOR THE NEGATIVE BINOMIAL.
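### A sketch quantifying the zero-count comparison behind this conclusion: the
###   per-dataset counts of zeros in groups C,D,E under each fitted model, to set
###   beside the number of zeros in the real C,D,E data.
sum(datfram$count[25:60] == 0)            ### zeros among the real C,D,E counts
summary(rowSums(DatTNor[,25:60] == 0))    ### zeros per simulated truncNormal dataset
summary(rowSums(DatNBin[,25:60] == 0))    ### zeros per simulated NegBin dataset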