Lecture 5.R: Illustration of chi-square goodness-of-fit and independence
tests, and glm model fits for Poisson regression, using the famous dataset
of counts of deaths by horse-kick in the Prussian army, 1875-94.
-----

1. DISPLAYING THE DATA

> horsek = read.csv("https://vincentarelbundock.github.io/Rdatasets/csv/pscl/prussian.csv")
> dim(horsek)
[1] 280   4
> table(horsek$year)
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 
14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 
> table(horsek$corp)
   G    I   II  III   IV   IX    V   VI  VII VIII    X   XI  XIV   XV 
  20   20   20   20   20   20   20   20   20   20   20   20   20   20 

### So this is a 20x14 cross-classification of counts of deaths by
### horse-kick, by corps and year, listed by successive years within corps.

> horskarr = array(horsek$y, c(20,14),
      dimnames=list(75:94, names(table(horsek$corp))))
> horskarr
   G I II III IV IX V VI VII VIII X XI XIV XV
75 0 0  0   0  0  0 0  1   1    0 0  0   1  0
76 2 0  0   0  1  0 0  0   0    0 0  0   1  1
77 2 0  0   0  0  0 1  1   0    0 1  0   2  0
78 1 2  2   1  1  0 0  0   0    0 1  0   1  0
79 0 0  0   1  1  2 2  0   1    0 0  2   1  0
80 0 3  2   1  1  1 0  0   0    2 1  4   3  0
81 1 0  0   2  1  0 0  1   0    1 0  0   0  0
82 1 2  0   0  0  0 1  0   1    1 2  1   4  1
83 0 0  1   2  0  1 2  1   0    1 0  3   0  0
84 3 0  1   0  0  0 0  1   0    0 2  0   1  1
85 0 0  0   0  0  0 1  0   0    2 0  1   0  1
86 2 1  0   0  1  1 1  0   0    1 0  1   3  0
87 1 1  2   1  0  0 3  2   1    1 0  1   2  0
88 0 1  1   0  0  1 1  0   0    0 0  1   1  0
89 0 0  1   1  0  1 1  0   0    1 2  2   0  2
90 1 2  0   2  0  1 1  2   0    2 1  1   2  2
91 0 0  0   1  1  1 0  1   1    0 3  3   1  0
92 1 3  2   0  1  1 3  0   1    1 0  1   1  0
93 0 1  0   0  0  1 0  2   0    0 1  3   0  0
94 1 0  0   0  0  0 0  0   1    0 1  1   0  0

> yeartots = apply(horskarr,1,sum)
> yeartots
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 
 3  5  7  9 10 18  6 14 11  9  5 11 15  6 11 17 12 15  8  4 
> c(mean(yeartots), sum(yeartots))    ### mean 9.8, total 196 deaths
[1]   9.8 196.0
> t(array(round(dpois(0:19,9.8), 4), c(10,2)))
       [,1]   [,2]   [,3]   [,4]   [,5]   [,6]   [,7]   [,8]  [,9]  [,10]
[1,] 0.0001 0.0005 0.0027 0.0087 0.0213 0.0418 0.0682 0.0955 0.117 0.1274
[2,] 0.1249 0.1112 0.0908 0.0685 0.0479 0.0313 0.0192 0.0111 0.006 0.0031
### These are the Poisson probabilities dpois(k, 9.8) for k = 0,...,19,
### with the overall mean 9.8 as parameter.

## Tally the totals within single years, which would ideally be Poisson(9.8):
> Kicks = table(yeartots)
> Kicks
yeartots
 3  4  5  6  7  8  9 10 11 12 14 15 17 18 
 1  1  2  2  1  1  2  1  3  1  1  2  1  1 
### Group these into the cells 0:4, 5:8, 9:12, 13:16, 17+
> Kick.counts = table(cut(as.numeric(yeartots), breaks=c(0,4,8,12,16,25)))
> Kick.counts
  (0,4]   (4,8]  (8,12] (12,16] (16,25] 
      2       6       7       3       2 
## Corresponding Poisson(9.8) expected counts, 20 times the cell probabilities:
> Kick.Exp = 20*diff(c(0, ppois(c(4,8,12,16,25), 9.8)))
> round(Kick.Exp, 3)
[1] 0.665 6.450 9.087 3.338 0.459

2. TESTING GOODNESS OF FIT OF THE POISSON, BY YEAR

## So we are ready to do a goodness-of-fit test of the Poisson, grouped
## into these intervals 0:4, 5:8, 9:12, 13:16, 17:25.
> sum((Kick.counts - Kick.Exp)^2/Kick.Exp)
[1] 8.398574    ### (this value was actually calculated using round(Kick.Exp,3))
> chisq.test(Kick.counts, p=Kick.Exp, rescale.p=T)   ## rescale because the
                                                     ## p's do not sum to 1

        Chi-squared test for given probabilities

data:  Kick.counts
X-squared = 8.3982, df = 4, p-value = 0.07804

### This p-value is just 1 - pchisq(8.3982, 4).
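A small aside: the top interval was cut off at 25, so the five Poisson(9.8)
cell probabilities do not sum exactly to 1, which is why rescale.p=T was
needed above. Using Inf as the top break, as we do in Section 3 below,
makes the cell probabilities sum to 1 exactly (Kick.Exp2 is just a name
invented for this aside):

> Kick.Exp2 = 20*diff(c(0, ppois(c(4,8,12,16,Inf), 9.8)))
> sum(Kick.Exp2)
[1] 20

The difference is negligible here (1 - ppois(25, 9.8) is about 1.3e-5),
but with Inf as the top break the rescale.p argument becomes unnecessary.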
#### There are at least three problems with using this grouped-data test:
##   First, the 9.8 Poisson mean was estimated, not hypothesized in
##     advance: we will deal with that in Section 3.
##   Second, the expected values in the end cells are < 1, so we may worry
##     that the Xsq statistic is not close to chi-square distributed.
##   Third, in this setting the LRT statistic is not close to Xsq.

## Let's deal with the Second worry directly, by simulation.
## A simulated p-value according to the exact multinomial distribution is
## obtained using additional arguments in the chisq.test function.
## First we do it in the setting where the 9.8 Poisson parameter is
## treated as fixed, not estimated:
> chisq.test(Kick.counts, p=Kick.Exp, rescale.p=T, simulate.p.value=T, B=5000)

        Chi-squared test for given probabilities with simulated p-value
        (based on 5000 replicates)

data:  Kick.counts
X-squared = 8.3982, df = NA, p-value = 0.08338

## not too different from 0.078
## The simulation was done by the R function essentially this way:
> pvec = Kick.Exp/20
> nexp = 20*pvec
> Xsqvals = numeric(5e5)
> for (k in 1:5e5) {
     nvec = rmultinom(1, 20, pvec)
     Xsqvals[k] = sum((nvec - nexp)^2/nexp)
  }
> mean(Xsqvals > 8.398)
[1] 0.0869
### this is the simulated p-value from 5e5 replications

### Next consider the LRT in place of Xsq for the grouped data, pretending
### that the lambda = 9.8 was hypothesized in advance.
### Remember that the grouping intervals were 0:4, 5:8, 9:12, 13:16, 17:25
### and the observed Kick.counts were 2,6,7,3,2, so the unrestricted
### multinomial MLEs were (2,6,7,3,2)/20, while the null-hypothesis cell
### probabilities were Kick.Exp/20.
## The LRT statistic is then
> 2*sum(as.numeric(Kick.counts)*log(as.numeric(Kick.counts)/Kick.Exp))
[1] 5.128041
### not large at all for a 4 df chi-square.
## To check the p-value of this statistic by simulation:
> LRTvals = numeric(5e5)
> for (k in 1:5e5) {
     nvec = rmultinom(1, 20, pvec)
     LRTvals[k] = 2*sum(nvec*log(nvec/Kick.Exp), na.rm=T)
  }
> mean(LRTvals > 5.128)
[1] 0.266764
### NB na.rm=T makes the 0*log(0) = NaN terms count as 0.
### Compare with 1 - pchisq(5.128, 4) = 0.2744.

### So our conclusion, based on comparing LRT = G^2 with X^2 and with the
### chi-square 4 df reference distribution, is that both of these
### statistics show empirical multinomial p-values close to their
### chi-square p-values, but they are not close to one another! When that
### is the case, the LRT has the Wilks Theorem to support it, but Xsq has
### no separate justification.
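To visualize this last point, one can overlay the simulated null
distributions of the two statistics on the chi-square 4 df density (a
sketch, not run in the original session, assuming the Xsqvals and LRTvals
vectors from the loops above are still in the workspace):

> hist(Xsqvals, breaks=60, prob=TRUE, col=rgb(1,0,0,0.3), xlim=c(0,20),
       main="Simulated X^2 (red) and G^2 (blue) vs chi-square 4 df")
> hist(LRTvals, breaks=60, prob=TRUE, col=rgb(0,0,1,0.3), add=TRUE)
> curve(dchisq(x,4), 0, 20, lwd=2, add=TRUE)

Both histograms should track the chi-square density reasonably well while
putting their (discrete) mass in different places, which is the X^2 vs G^2
discrepancy just described.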
3. DEALING HONESTLY WITH THE ESTIMATED POISSON PARAMETER -- BY GROUPED DATA

## Our tentative conclusion so far was that if the 9.8 mean had been
## hypothesized in advance, we would not reject the Poisson null
## hypothesis. But it WAS ESTIMATED. Let us first think how to do a fair
## hypothesis test using count data grouped into the intervals
## 0:4, 5:8, 9:12, 13:16, 17:Inf.
> countvec = as.numeric(Kick.counts)
> countvec
[1] 2 6 7 3 2
## Again we know the unrestricted cell-probability MLEs are c(2,6,7,3,2)/20.
## The LRT tells us that if the data are grouped, we do NOT estimate lambda
## by the overall mean, but rather by the maximizer of the likelihood based
## on the Poisson cell probabilities for this data grouping.
## Here is a function that sums the Poisson probabilities over the grouping
## intervals as a function of the unknown lambda = x and computes the
## negative log-likelihood, which we will minimize over x:
> NlogLik = function(x, cvec=countvec) {   ### -loglik based on grouped data
     logpmat = array(0, c(length(x),5))
     for(j in 1:length(x))
        logpmat[j,] = log(diff(c(0, ppois(c(4,8,12,16,Inf), x[j]))))
     -c(logpmat %*% cvec)
  }
> curve(NlogLik, 0.1, 20)   ### a really nice function with a clear
                            ### interior minimum
> optim(1, NlogLik, method="L-BFGS-B", lower=0.1, upper=20)
$par
[1] 8.821521

$convergence
[1] 0

### So the grouped-data restricted Poisson lambda MLE is 8.822, and the
### G^2 statistic becomes
> RMLE.Exp = 20*diff(c(0, ppois(c(4,8,12,16,Inf), 8.8215)))
> 2*sum(countvec*log(countvec/RMLE.Exp))
[1] 7.592756
### Now the degrees of freedom are 3, and the p-value is
### 1 - pchisq(7.593, 3) = 0.0552.
### This is really borderline, so let's check by simulation, where we
### simulate 20 Poissons, group them, find the restricted MLE, and
### compute the LRT:
> LRTvals2 = numeric(5e4)
> for (k in 1:5e4) {
     xvec = rpois(20, 8.822)
     nvec = as.numeric(table(cut(xvec, breaks=c(0,4,8,12,16,Inf))))
     lam.MLEr = optim(1, NlogLik, method="L-BFGS-B", lower=0.1, upper=20,
                      cvec=nvec)$par
     rmle.exp = 20*diff(c(0, ppois(c(4,8,12,16,Inf), lam.MLEr)))
     LRTvals2[k] = 2*sum(nvec*log(nvec/rmle.exp), na.rm=T)
  }
> mean(LRTvals2 > 7.593)
[1] 0.15252
> LRTvals3 = numeric(5e4)
> for (k in 1:5e4) {
     xvec = rpois(20, 9.8)
     nvec = as.numeric(table(cut(xvec, breaks=c(0,4,8,12,16,Inf))))
     lam.MLEr = optim(1, NlogLik, method="L-BFGS-B", lower=0.1, upper=20,
                      cvec=nvec)$par
     rmle.exp = 20*diff(c(0, ppois(c(4,8,12,16,Inf), lam.MLEr)))
     LRTvals3[k] = 2*sum(nvec*log(nvec/rmle.exp), na.rm=T)
  }
> mean(LRTvals3 > 7.593)
[1] 0.1401
### The p-value is quite properly smaller than in the case where we did
### not treat lambda as estimated. And here we found via simulation that
### the p-value is not well approximated by the chi-square 3 df one. Our
### conclusion based on the grouped data is that the Poisson distribution
### is still an adequate description of the data.
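The two simulation loops above are identical except for the generating
value of lambda, so they could be wrapped in a small helper function (a
sketch, not part of the original session; the name sim.LRT.pval is made up
here, and the function re-uses NlogLik and its grouping from above):

> sim.LRT.pval = function(lambda, LRT.obs, B=5e4, n=20) {
     ## parametric bootstrap: simulate n Poisson(lambda) year-totals,
     ## group into 0:4, 5:8, 9:12, 13:16, 17+, re-estimate lambda by
     ## grouped-data maximum likelihood, recompute the LRT -- B times
     brks = c(0,4,8,12,16,Inf)
     LRTv = numeric(B)
     for (k in 1:B) {
        xvec = rpois(n, lambda)
        nvec = as.numeric(table(cut(xvec, breaks=brks)))
        lam  = optim(1, NlogLik, method="L-BFGS-B", lower=0.1, upper=20,
                     cvec=nvec)$par
        expv = n*diff(c(0, ppois(brks[-1], lam)))
        LRTv[k] = 2*sum(nvec*log(nvec/expv), na.rm=T)
     }
     mean(LRTv > LRT.obs)
  }

Calls like sim.LRT.pval(8.822, 7.593) and sim.LRT.pval(9.8, 7.593) then
reproduce the two loops above, up to Monte Carlo error.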
4. TESTING POISSON GOODNESS OF FIT WITHOUT GROUPING?

### We might try to apply the LRT to the 20 tabulated observations with no
### grouping at all, using counts in the range 0:25 (which seems
### reasonable because 1 - ppois(25, 9.8) = 1.3e-5).
## This might be reasonable if the sample size were really large, but most
## of the cells have very small probability [displayed above], so Xsq is
## not usable: many cells would have very small "expected" values in the
## denominators. We could still try G^2, however.
> Kicks
yeartots
 3  4  5  6  7  8  9 10 11 12 14 15 17 18 
 1  1  2  2  1  1  2  1  3  1  1  2  1  1 
### The LRT in that problem has restricted MLE for lambda = mean = 9.8,
### and the LRT statistic becomes
> 2*sum(as.numeric(Kicks)*log(as.numeric(Kicks)/
        (20*dpois(as.numeric(names(Kicks)), 9.8))))
[1] 17.00995
## The degrees of freedom would be something like 25, so this is very
## moderate. Even if we viewed the relevant cells as 2:18, the degrees of
## freedom would still be 16, and we would still accept the Poisson null
## hypothesis.

NB. Goodness-of-fit testing in a distributional setting would often be
done against a specific alternative (like the Negative Binomial) allowing
"dispersion", and we will talk about that approach later in the course.
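As a preview of that approach, here is a sketch (not part of the original
session; the names NBfit and LRT.disp are invented) of how one could fit
the Negative Binomial alternative to the 20 yearly totals by maximum
likelihood with MASS::fitdistr and compare log-likelihoods:

> library(MASS)
> NBfit = fitdistr(as.numeric(yeartots), "negative binomial")
> logLik.NB   = as.numeric(logLik(NBfit))
> logLik.Pois = sum(dpois(as.numeric(yeartots), mean(yeartots), log=TRUE))
> LRT.disp = 2*(logLik.NB - logLik.Pois)
> ## The Poisson is the boundary case (size -> Inf) of the Negative
> ## Binomial, so under the Poisson null this LRT is approximately a
> ## 50:50 mixture of 0 and a chi-square 1 df; a rough p-value is
> 0.5*(1 - pchisq(LRT.disp, 1))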
5. ADDITIONAL STRUCTURE IN THE DATA

We began with

> yeartots
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 
 3  5  7  9 10 18  6 14 11  9  5 11 15  6 11 17 12 15  8  4 

Under the Poisson(lambda) assumption, these are all Poisson(9.8) variates.
One could look for structure such as a time trend. We could group them
into 4 successive 5-year batches:

> t(array(as.numeric(yeartots), c(5,4)))
     [,1] [,2] [,3] [,4] [,5]
[1,]    3    5    7    9   10
[2,]   18    6   14   11    9
[3,]    5   11   15    6   11
[4,]   17   12   15    8    4
> apply(.Last.value, 1, sum)
[1] 34 58 48 56     ### the totals of the 5-year row batches

### But a time trend is not indicated. We could also look back at the
### original tabulated data by army corps:
> corp.tots = apply(horskarr, 2, sum)
> corp.tots
   G    I   II  III   IV   IX    V   VI  VII VIII    X   XI  XIV   XV 
  16   16   12   12    8   11   17   12    7   13   15   25   24    8 

## The rates of kicks per year by corps range from 0.35 to 1.25, and here
## it does seem likely that the different corps had different rates --
## though the rates might simply be proportional to the sizes of the
## corps. The differences in rates among corps seem clear enough that a
## hypothesis test is hardly necessary, but here is code & output
## "confirming" them:
> chisq.test(corp.tots, correct=F)

        Chi-squared test for given probabilities

data:  corp.tots
X-squared = 27.286, df = 13, p-value = 0.01137

###########################

There is also the possibility of looking for structure in the
cross-classified year x corps dataset that we started with. This allows
separate corps and year "effects" but tests for row-column interaction:

> dim(horskarr)
[1] 20 14
### The number of null-hypothesis "parameters" is 20+14-1 = 33 in this
### table of 280 items, if the table total is regarded as a random
### outcome, so the degrees of freedom are 280 - 33 = 247.
## NB: if the table total is regarded as fixed, there are 279 free
## multinomial probabilities in the unrestricted case, but only
## (20-1)+(14-1) = 32 in the restricted (row-column independence) model,
## and 279 - 32 = 247 again.
> chisq.test(horskarr)

        Pearson's Chi-squared test

data:  horskarr
X-squared = 239.77, df = 247, p-value = 0.6173

### So there is no evidence of year-by-corps interaction in the counts.

One could ask many "model"-related questions about the effects of year (as
a single predictor) and corps (as a set of dummy variables, one parameter
for each corps) within a regression-type model for the log of the rate
applicable to the 280 year-by-corps combinations.

### If the horskarr entries were independent Poisson with rates of the
### form a(corp)*b(year), then the "Poisson regression" of y on year and
### corp without interaction (with log(rate) showing additive,
### non-interacting effects of year and corp) should be a good model.

----- THIS PART COULD BE SKIPPED FOR NOW: WE COVER POISSON REGRESSION LATER

> yreg = glm(y ~ year + corp, data=horsek, family=poisson)
> anova(yreg)
Analysis of Deviance Table

Model: poisson, link: log

Response: y

Terms added sequentially (first to last)

     Df Deviance Resid. Df Resid. Dev
NULL                   279     323.23
year  1   2.2866       278     320.94
corp 13  26.1369       265     294.81

> qchisq(0.95, 13)
[1] 22.36203
> 1 - pchisq(26.137, 13)
[1] 0.0162949
### This indicates that the year effects are not very strong, but the
### corps effects are:
> round(summary(yreg)$coef, 4)
            Estimate Std. Error z value Pr(>|z|)
(Intercept)  -1.8146     1.0873 -1.6689   0.0951
year          0.0188     0.0124  1.5095   0.1312
corpI         0.0000     0.3535  0.0000   1.0000
corpII       -0.2877     0.3819 -0.7534   0.4512
corpIII      -0.2877     0.3819 -0.7533   0.4512
corpIV       -0.6931     0.4330 -1.6008   0.1094
corpIX       -0.2076     0.3734 -0.5561   0.5781
corpV        -0.3747     0.3917 -0.9567   0.3387
corpVI        0.0606     0.3483  0.1741   0.8618
corpVII      -0.2877     0.3819 -0.7533   0.4512
corpVIII     -0.8267     0.4532 -1.8243   0.0681
corpX        -0.0645     0.3594 -0.1796   0.8575
corpXI        0.4463     0.3202  1.3940   0.1633
corpXIV       0.4055     0.3227  1.2563   0.2090
corpXV       -0.6931     0.4330 -1.6009   0.1094
### The many insignificant coefficients indicate that we have spent (far)
### too many parameters on this model, but an overall residual deviance of
### 295 on 265 df indicates an adequate fit:
> 1 - pchisq(323, 279)
[1] 0.0358712
> 1 - pchisq(295, 265)
[1] 0.09928192
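Incidentally, the chi-square comparison just done by hand can be requested
directly from R's anova method for glm fits (not run in the original
session):

> anova(yreg, test="Chisq")
## appends a Pr(>Chi) column to the deviance table; for corp it is
## 1 - pchisq(26.1369, 13) = 0.0163, exactly the value computed above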
6. TESTING A POISSON ASSUMPTION WITH CORPS-SPECIFIC RATES

## Here we use the year-by-corps cross-classified data, 280 items:
> table(c(horskarr))
  0   1   2   3   4 
144  91  32  11   2 
### 196 deaths in total

### How to test a Poisson assumption with corps-specific rates? It seems
### clear that the grouping needed would be 0, 1, 2, 3 and 4+, or better
### still 0, 1, 2, 3+. So now the data are viewed as 14 vectors of 20
### 4-valued categorical outcomes each.
## Start by creating an array of grouped data, essentially by replacing
## the 4-counts by 3's, viewing the 3's as 3+, and then tabulating by
## corps:
> tmp = array(pmin(3, horskarr), c(20,14), dimnames=dimnames(horskarr))
> horskarr2 = t(apply(tmp, 2, function(col)
       as.numeric(table(factor(col, levels=0:3)))))
> colnames(horskarr2) = c("0","1","2","3+")
### We need factor(col, levels=0:3) because not all corps exhibit the
### same set of 0:3 outcomes.
### Here is the resulting 14x4 array dataset:
> horskarr2
      0 1 2 3+
G     9 7 3  1
I    11 4 3  2
II   12 4 4  0
III  11 6 3  0
IV   12 8 0  0
IX   10 9 1  0
V     9 7 2  2
VI   11 6 3  0
VII  13 7 0  0
VIII 10 7 3  0
X    10 6 3  1
XI    6 8 2  4
XIV   6 8 3  3
XV   14 4 2  0

### This is a multinomial dataset in each row, and we want an overall test
### of the adequacy of the null hypothesis that all of these corps
### outcomes look Poisson, with each corps having its own Poisson rate.
### The unrestricted model has 14*3 free parameters, with MLEs equal to
### the multinomial counts divided by the row sums of 20.
## The restricted-model estimates of the cell probabilities in each row
## are dpois(k, x) for k = 0, 1, 2, with x the Poisson parameter for that
## row (and then the 3+ cell probability in the same row is forced equal
## to 1 minus the sum of the others, i.e. to 1 - ppois(2, x)).
## So now we get the restricted MLEs by solving for the x's as
## grouped-data MLEs in each row, maximizing the corps-specific likelihood
## with a single Poisson parameter shared by that corps' 20 years of
## 0, 1, 2, 3+ grouped data:
> NLLik.C = function(x, ccounts) {
     ## ccounts are the numbers of 0's, 1's, 2's, 3+'s among the
     ## 20 years' deaths for a specific corps
     out = numeric(length(x))
     for (k in 1:length(x))
        out[k] = sum(ccounts*log(c(dpois(0:2, x[k]), 1 - ppois(2, x[k]))))
     -out
  }
> curve(NLLik.C(x, horskarr2[2,]), 0.001, 3)
### this shows a typical negative log-likelihood to be minimized
### (here for corps "I")
> MLEr.corp = numeric(14)
> for(k in 1:14) MLEr.corp[k] = optim(1, NLLik.C, method="L-BFGS-B",
       lower=0.01, upper=3, ccounts=horskarr2[k,])$par
> round(MLEr.corp, 4)
 [1] 0.8115 0.8233 0.6000 0.6000 0.4000 0.5500 0.8750 0.6000 0.3500 0.6500
[11] 0.7607 1.2774 1.2041 0.4000
### Compare these restricted MLEs with the corps-specific means:
> round(as.numeric(corp.tots)/20, 4)
 [1] 0.80 0.80 0.60 0.60 0.40 0.55 0.85 0.60 0.35 0.65 0.75 1.25 1.20 0.40
### They are very close -- many, but not all, are exactly the same.

## Now we are ready to form our LRT: 14*3 unrestricted parameters versus
## 14 restricted ones, so the overall degrees of freedom are 14*3-14 = 28.
> horskarr2.exp = array(0, c(14,4), dimnames(horskarr2))
> for(k in 1:14) horskarr2.exp[k,] = 20*c(dpois(0:2, MLEr.corp[k]),
       1 - ppois(2, MLEr.corp[k]))
> cbind(Exp=round(horskarr2.exp,2), Obs=horskarr2)
         0    1    2   3+  0 1 2 3+
G     8.88 7.21 2.92 0.98  9 7 3  1
I     8.78 7.23 2.98 1.02 11 4 3  2
II   10.98 6.59 1.98 0.46 12 4 4  0
III  10.98 6.59 1.98 0.46 11 6 3  0
IV   13.41 5.36 1.07 0.16 12 8 0  0
IX   11.54 6.35 1.75 0.37 10 9 1  0
V     8.34 7.30 3.19 1.18  9 7 2  2
VI   10.98 6.59 1.98 0.46 11 6 3  0
VII  14.09 4.93 0.86 0.11 13 7 0  0
VIII 10.44 6.79 2.21 0.57 10 7 3  0
X     9.35 7.11 2.70 0.84 10 6 3  1
XI    5.58 7.12 4.55 2.75  6 8 2  4
XIV   6.00 7.22 4.35 2.43  6 8 3  3
XV   13.41 5.36 1.07 0.16 14 4 2  0
### This looks pretty good for the grouped-data Poisson model, and the
### resulting 28 df LRT statistic is
> 2*sum(c(horskarr2*log(horskarr2/horskarr2.exp)), na.rm=T)
[1] 25.74281
### so the p-value is
> 1 - pchisq(25.74, 28)
[1] 0.5873336
### and we happily accept the null hypothesis that each corps' yearly
### death counts are Poisson with its own corps-specific rate.
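Earlier sections checked chi-square approximations by simulation, and the
same parametric-bootstrap check could be done here (a sketch, not run in
the original session; LRTvals4 is an invented name, and the code re-uses
NLLik.C and the fitted rates MLEr.corp):

> LRTvals4 = numeric(2000)
> for (b in 1:2000) {
     lrt = 0
     for (k in 1:14) {
        ## simulate 20 years of deaths for corps k at its fitted rate,
        ## group into 0,1,2,3+, refit the rate, and add up the LRT
        nvec = as.numeric(table(factor(pmin(3, rpois(20, MLEr.corp[k])),
                                       levels=0:3)))
        lam  = optim(1, NLLik.C, method="L-BFGS-B", lower=0.01, upper=3,
                     ccounts=nvec)$par
        expv = 20*c(dpois(0:2, lam), 1 - ppois(2, lam))
        lrt  = lrt + 2*sum(nvec*log(nvec/expv), na.rm=T)
     }
     LRTvals4[b] = lrt
  }
> mean(LRTvals4 > 25.74)   ## simulated p-value, to compare with 0.587

With expected counts as small as some of those in the 3+ column, the
chi-square 28 df approximation is not guaranteed, so a check of this kind
seems worthwhile.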