R Scripts for EM Algorithm Examples on Handout NeEMhandoutsS19.pdf
===================================================================
E. Slud      2/18/2019

Example I. Contingency Table

NegLlkCom = function(theta, Xmat) {
    ### Negative complete-data log-likelihood, where Xmat is the 2x3 table,
    ### as a function of theta = (p1, p2, alpha);
    ### p3 and gamma are filled in using the constraints.
    p1 = theta[1]; p2 = theta[2]; p3 = 1-p1-p2
    alpha = theta[3]; gamma = (1-alpha*p1)/(2-p1)
    Xcol = Xmat[1,] + Xmat[2,]
    -( Xmat[1,1]*log(alpha/gamma) + sum(Xcol*log(c(p1,p2,p3))) +
       sum(c(Xmat))*log(gamma) ) }

NegLlkObs1 = function(theta, Xvec) {
    ### Negative observed-data log-likelihood as a function of theta = (p1, p2, alpha),
    ### where p3 and gamma are filled in using the constraints, and
    ### Xvec is the 5-vector of X11, X12, X13, X21, X22+X23.
    p1 = theta[1]; p2 = theta[2]; p3 = 1-p1-p2
    alpha = theta[3]; gamma = (1-alpha*p1)/(2-p1)
    -( Xvec[1]*log(alpha/gamma) + sum(Xvec)*log(gamma) +
       sum(c(Xvec[1]+Xvec[4], Xvec[2], Xvec[3])*log(c(p1,p2,p3))) +
       Xvec[5]*log(1-p1) ) }

MLEcom = function(Xmat) {
    ### Closed-form complete-data MLE of (p1, p2, alpha) from the full 2x3 table Xmat
    N = sum(c(Xmat))
    aux = N - Xmat[1,1] + Xmat[2,1]
    p1 = 2*Xmat[2,1]/aux
    c(p1 = p1, p2 = (Xmat[1,2]+Xmat[2,2])/aux, alpha = Xmat[1,1]/(N*p1)) }

XEMstep = function(Xvec, pvec) {
    ### One EM update of p2 for the "obs1" data, in which X22 and X23 are not
    ### observed separately; pvec = (p1, p2, p3) with p1 held fixed.
    N = sum(c(Xvec))
    aux = N - Xvec[1] + Xvec[4]
    pvec[2] = (Xvec[2] + pvec[2]*Xvec[5]/(1-pvec[1]))/aux
    pvec[2] }

### Now input the table given on p.3 of the handout for this "obs1" data case.

Xvec0 = c(30,25,45,50,58)
Xmat0 = rbind(Xvec0[1:3], c(Xvec0[4], 0.5*Xvec0[5]*c(1,1)))

## Initialize at
> pvec0 = c(50/108, (58/108)*c(25,45)/70)
> pvec0
[1] 0.4629630 0.1917989 0.3452381
> gam0 = 108/208
> alp0 = (1-(2-pvec0[1])*gam0)/pvec0[1]
> alp0
[1] 0.4361538
> rbind(c(alp0*pvec0[1], gam0*pvec0[2:3]), gam0*pvec0)*208
     [,1]     [,2]     [,3]
[1,]   42 20.71429 37.28571
[2,]   50 20.71429 37.28571
### initial guess, logLik = -366.77

> MLobs1 = nlm(NegLlkObs1, c(pvec0[1:2],alp0), hessian=T, Xvec=Xvec0)
$minimum
[1] 325.8568          ### maximized logLik = -325.86
$estimate
[1] 0.4385959 0.2005010 0.3288463
$gradient
[1] 3.416289e-05 4.877165e-05 2.125944e-05
$code
[1] 1
$iterations
[1] 8

### Fitted cell probabilities
> pfit1 = c(MLobs1$est[1:2], 1-sum(MLobs1$est[1:2]))
  gfit1 = (1-MLobs1$est[3]*pfit1[1])/(2-pfit1[1])
  Xfit1 = matrix(c(MLobs1$est[3]*pfit1[1], gfit1*c(pfit1[-1],pfit1)),
                 byrow=T, nrow=2)*208
  Xfit1
         [,1]    [,2]     [,3]
[1,] 29.99996 22.8571 41.14293
[2,] 49.99997 22.8571 41.14293      ### final fit, OK

### Now let's check the final answer against the closed-form formulas.
c(pfit1[1], 2*Xvec0[4]/(sum(Xvec0)-Xvec0[1]+Xvec0[4]))
[1] 0.4385962 0.4385965          ### OK
c(pfit1[2], Xvec0[2]*(sum(Xvec0)-Xvec0[1]-Xvec0[4])/
    ((sum(Xvec0)-Xvec0[1]+Xvec0[4])*sum(Xvec0[2:3])))
[1] 0.2005009 0.2005013          ### OK

### Now let's run the EM iteration, starting from Xvec0.
pvec = c(pfit1[1], 0.5*c(1,1)*(1-pfit1[1]))
for(j in 1:15) {
   pvec[2] = XEMstep(Xvec0, pvec)
   cat("iter ",j,"; p2=",round(pvec[2],6),"\n") }
iter 1 ; p2= 0.236842
iter 2 ; p2= 0.216968
iter 3 ; p2= 0.207963
iter 4 ; p2= 0.203882
iter 5 ; p2= 0.202033
iter 6 ; p2= 0.201195
iter 7 ; p2= 0.200816
iter 8 ; p2= 0.200644
iter 9 ; p2= 0.200566
iter 10 ; p2= 0.20053
iter 11 ; p2= 0.200514
iter 12 ; p2= 0.200507
iter 13 ; p2= 0.200504
iter 14 ; p2= 0.200502
iter 15 ; p2= 0.200502
### This confirms convergence, after not too many iterations, to the
### same p2 value found (in pfit1[2]) by maximizing L_{obs,1} directly.
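
## As a reading aid (our annotation, not part of the original handout script), the
## single update carried out by XEMstep can be written as an explicit E-step followed
## by the complete-data M-step formula for p2 from MLEcom:
##
##    E-step:  Xhat_{22}^{(t)} = E[ X_{22} | X_{22}+X_{23}, theta^{(t)} ]
##                             = (X_{22}+X_{23}) * p_2^{(t)} / (1 - p_1^{(t)})
##    M-step:  p_2^{(t+1)} = ( X_{12} + Xhat_{22}^{(t)} ) / ( N - X_{11} + X_{21} )
##
## Only p2 needs to be iterated in the "obs1" case: the complete-data MLEs of p1 and
## alpha, namely 2*X_{21}/(N-X_{11}+X_{21}) and X_{11}/(N*p1), involve only counts
## observed here, which is why the loop above holds pvec[1] fixed at pfit1[1].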
#------------------
## Continue by defining log L_{obs,2} for the second case of observed data,
## with X_{21}+X_{22} observed in place of X_{21} and X_{22} separately.

NegLlkObs2 = function(theta, Xvec) {
    ### Negative observed-data log-likelihood as a function of theta = (p1, p2, alpha),
    ### where p3 and gamma are filled in using the constraints, and
    ### Xvec is the 5-vector of X11, X12, X13, X21+X22, X23.
    p1 = theta[1]; p2 = theta[2]; p3 = 1-p1-p2
    alpha = theta[3]; gamma = (1-alpha*p1)/(2-p1)
    -( Xvec[1]*log(alpha/gamma) + sum(Xvec)*log(gamma) +
       sum(c(Xvec[1:2], Xvec[3]+Xvec[5])*log(c(p1,p2,p3))) +
       Xvec[4]*log(1-p3) ) }

Xvec2 = c(30,25,45,71,37)
MLE2 = MLEcom(rbind(c(30,25,45), c(50,21,37)))   ## complete-data MLE, for comparison
round(MLE2, 4)
    p1     p2  alpha
0.4386 0.2018 0.3288

## Start by numerically maximizing L_{obs,2} directly, from the L_{obs,1} MLE.
> MLobs2 = nlm(NegLlkObs2, MLobs1$est, hessian=T, Xvec=Xvec2)
$minimum
[1] 320.5369          #### maximized logLik_{obs,2} = -320.54
$estimate
[1] 0.4107145 0.2232137 0.3511699
$gradient
[1] 1.506351e-04 6.343726e-05 1.401190e-04
$code
[1] 1
$iterations
[1] 9

### Next do it via EM: start with p1=.2, p2=.2, alpha=.5
thet = c(.2,.2,.5)
Xmat = rbind(Xvec2[1:3], c(c(.5,.5)*Xvec2[4], Xvec2[5]))
-NegLlkObs2(thet, Xvec2)          ### initial logL_{obs,2} = -339.638

> for(j in 1:20) {
     Xmat[2,1:2] = Xvec2[4]*thet[1:2]/sum(thet[1:2])   ## E-step: split X21+X22 as p1:p2
     thet = MLEcom(Xmat)                               ## M-step: complete-data MLE
     cat("iter ",j,"; pfit=",round(c(thet[1:2],1-sum(thet[1:2])),4),
         "; logL_{obs2}=",-NegLlkObs2(thet,Xvec2),"\n") }
iter 1 ; pfit= 0.3326 0.2834 0.3841 ; logL_{obs2}= -321.2256
iter 2 ; pfit= 0.3544 0.2666 0.379 ; logL_{obs2}= -320.9111
iter 3 ; pfit= 0.3709 0.2539 0.3752 ; logL_{obs2}= -320.7311
iter 4 ; pfit= 0.3829 0.2446 0.3725 ; logL_{obs2}= -320.6341
iter 5 ; pfit= 0.3915 0.238 0.3705 ; logL_{obs2}= -320.5843
iter 6 ; pfit= 0.3975 0.2334 0.3691 ; logL_{obs2}= -320.5596
iter 7 ; pfit= 0.4017 0.2302 0.3681 ; logL_{obs2}= -320.5476
iter 8 ; pfit= 0.4046 0.2279 0.3675 ; logL_{obs2}= -320.5419
iter 9 ; pfit= 0.4065 0.2264 0.367 ; logL_{obs2}= -320.5392
iter 10 ; pfit= 0.4079 0.2254 0.3667 ; logL_{obs2}= -320.538
iter 11 ; pfit= 0.4088 0.2247 0.3665 ; logL_{obs2}= -320.5374
iter 12 ; pfit= 0.4094 0.2242 0.3664 ; logL_{obs2}= -320.5371
iter 13 ; pfit= 0.4098 0.2239 0.3663 ; logL_{obs2}= -320.537
iter 14 ; pfit= 0.4101 0.2237 0.3662 ; logL_{obs2}= -320.537
iter 15 ; pfit= 0.4103 0.2235 0.3662 ; logL_{obs2}= -320.5369
iter 16 ; pfit= 0.4104 0.2234 0.3661 ; logL_{obs2}= -320.5369
iter 17 ; pfit= 0.4105 0.2234 0.3661 ; logL_{obs2}= -320.5369
iter 18 ; pfit= 0.4106 0.2233 0.3661 ; logL_{obs2}= -320.5369
iter 19 ; pfit= 0.4106 0.2233 0.3661 ; logL_{obs2}= -320.5369
iter 20 ; pfit= 0.4107 0.2233 0.3661 ; logL_{obs2}= -320.5369
### Even after these 20 iterations we have not quite converged, but almost;
### a tolerance-based wrapper for the same iteration is sketched below.

##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
## Up to this point we have direct and EM estimates of the observed-data MLEs for two
## different versions of the observed data from the 2x3 "complete data" contingency
## table.  One of those was for the data in problem part I(c) of HW2, i.e. for the
## data with likelihood L_{obs,1}, where X_{22}+X_{23} is observed but not X_{22}
## and X_{23} separately.
##
## The other version of the data, with likelihood L_{obs,2}, corresponding to
## X_{21}+X_{22} being observed but not X_{21} and X_{22} separately, was also treated
## in the script above.  In particular, the estimates of (p1,p2,alpha) were given in
## MLobs2$est by direct numerical maximization of log L_{obs,2}, and in the variable
## "thet" above via EM.
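
## Since the 20 EM iterations above stop a bit short of convergence, here is a minimal
## wrapper (a sketch of ours, not part of the original script) that repeats the same
## E-step and M-step until successive parameter changes fall below a tolerance.  It
## assumes the MLEcom and NegLlkObs2 functions defined above; the names EMobs2, tol,
## and maxit are ours.

EMobs2 = function(Xvec, thet = c(.2,.2,.5), tol = 1e-8, maxit = 500) {
    ## Xvec = (X11, X12, X13, X21+X22, X23), as in NegLlkObs2
    Xmat = rbind(Xvec[1:3], c(0.5*Xvec[4]*c(1,1), Xvec[5]))
    for(j in 1:maxit) {
       Xmat[2,1:2] = Xvec[4]*thet[1:2]/sum(thet[1:2])   ## E-step: split X21+X22 as p1:p2
       thet.new = MLEcom(Xmat)                          ## M-step: complete-data MLE
       if(max(abs(thet.new - thet)) < tol) { thet = thet.new; break }
       thet = thet.new }
    list(estimate = thet, iterations = j, logLobs2 = -NegLlkObs2(thet, Xvec)) }

## e.g.  EMobs2(Xvec2)$estimate  should essentially reproduce MLobs2$estimate.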
## To complete our script, we compare the observed-data Fisher information (for the
## whole sample, not per-observation) I_{obs,2}, which we computed (but did not yet
## display) in MLobs2$hessian.  Recall that the order of the parameter components in
## the R script was (p1,p2,alpha), so the components must be reordered to give:

> round( MLobs2$hessian[c(3,1:2), c(3,1:2)], 2)
       [,1]   [,2]    [,3]
[1,] 284.13 243.08    0.00
[2,] 243.08 926.11  788.86
[3,]   0.00 788.86 1290.17

## The EM-algorithm version of the same information matrix is obtained through
## Louis' (1982) formula, formula (2.2) of Theorem 2.7 in the Kim-Shao book.
## (YOU ARE ASKED IN HW2 PROBLEM I(c) TO DO AN ANALOGOUS CALCULATION FOR I_{obs,1}
## BASED ON L_{obs,1}.)
## The formula (2.22) (expanded to an estimate of information based on the whole
## observed data-sample) requires calculation of three matrices, shown carefully in
## the handout EMinfo-examp.pdf linked on the web-page under the EM handouts (1).
## The numerical computations and comparisons follow.

## Recall Xvec2 = c(30,25,45,71,37) holds respectively the values of X11, X12, X13,
## X21+X22, X23, and thet[c(3,1:2)] is the R variable containing (alpha,p1,p2).

> alpha = thet[3]; p1 = thet[1]; p2 = thet[2]
> thet[c(3,1:2)]
    alpha        p1        p2
0.3512191 0.4106575 0.2232580

## NB as a rough check on the model, gamma should be estimated consistently both by
## the observed second-row fraction and by the constraint formula at the fitted values:
> as.vector(c(sum(Xvec2[4:5]/208), (1-alpha*p1)/(2-p1)))
[1] 0.5192308 0.5384423          #### Not perfect, but OK

> Xp3 = Xvec2[3] + Xvec2[5]
> ObsInfo2 = array( c(
     Xvec2[1]/alpha^2 + (208-Xvec2[1])*p1^2/(1-alpha*p1)^2,
     (208-Xvec2[1])/(1-alpha*p1)^2,
     0,
     (208-Xvec2[1])/(1-alpha*p1)^2,
     Xvec2[1]/p1^2 + Xvec2[4]/(p1*(p1+p2)) + Xp3/(1-p1-p2)^2 +
        (208-Xvec2[1])*(alpha^2/(1-alpha*p1)^2 - 1/(2-p1)^2) -
        Xvec2[4]*p2/(p1*(p1+p2)^2),
     Xp3/(1-p1-p2)^2 + Xvec2[4]/(p1+p2)^2,
     0,
     Xp3/(1-p1-p2)^2 + Xvec2[4]/(p1+p2)^2,
     Xvec2[2]/p2^2 + Xvec2[4]/(p2*(p1+p2)) + Xp3/(1-p1-p2)^2 -
        Xvec2[4]*p1/(p2*(p1+p2)^2)),
     c(3,3))
> ObsInfo2
         [,1]     [,2]      [,3]
[1,] 284.1896 243.0562    0.0000
[2,] 243.0562 925.9515  788.5421
[3,]   0.0000 788.5421 1290.1056

## Compare this once again with
> round(MLobs2$hess[c(3,1:2), c(3,1:2)], 2)
       [,1]   [,2]    [,3]
[1,] 284.13 243.08    0.00
[2,] 243.08 926.11  788.86
[3,]   0.00 788.86 1290.17
### OK: THIS MATRIX IS NOW VERY CLOSE TO THE ONE OBTAINED BY DIRECT logL_{obs}
### MAXIMIZATION, as it must be, because the Louis formula (2.23) is an algebraically
### identical re-expression of the observed-data observed information !!!
### The only discrepancy is due to the difference between the EM-estimated (and not
### fully converged) estimator thet and the direct numerical maximizer MLobs2$est.


Example 2. Unbalanced ANOVA        2/25/2019
---------------------------

## First simulate a moderately large unbalanced-ANOVA dataset with m=30 groups.
nvec = sample(10:30, 30, replace=T)      ### sum(nvec) = N = 600 in this realization
thet0 = c(mu=4, siga=0.7, sige=0.4)
avec = rnorm(30, 0, thet0[2])
yvec = thet0[1] + rep(avec, nvec) + rnorm(600, 0, thet0[3])
> Xbarj = aggregate(yvec, by=list(rep(1:30,nvec)), mean)$x     ## the m group means

## NB in this example, for each j the within-group vector (X_{ij})_i is MVN with
## covariance matrix the sum of a rank-1 matrix and a diagonal matrix.
## The log-likelihood is slightly messy.
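## Written out (up to an additive constant involving 2*pi), the negative marginal
## log-likelihood that the next function implements has the standard random-effects
## form, with Ybar_j denoting the group means stored above in Xbarj:
##
##   -logL_obs(mu, sig_a, sig_e) =
##        (N-m)*log(sig_e) + (1/2) sum_j log(n_j*sig_a^2 + sig_e^2)
##        + sum_{i,j} (Y_{ij} - mu)^2 / (2*sig_e^2)
##        - (sig_a^2/(2*sig_e^2)) sum_j n_j^2 (Ybar_j - mu)^2 / (sig_e^2 + n_j*sig_a^2)
##
## This follows because the group-j covariance matrix sig_e^2*I + sig_a^2*J has
## determinant (sig_e^2)^(n_j-1) * (sig_e^2 + n_j*sig_a^2) and an inverse of the
## Sherman-Morrison form.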
NLogLikANOVA = function(thet, Yv, nv) {
   ### Negative observed-data log-likelihood for the unbalanced one-way random-effects
   ### model, thet = (mu, siga, sige); uses the group means Xbarj from the global
   ### environment, and omits the additive constants involving 2*pi.
   m = length(nv)
   N = length(Yv)
   (N-m)*log(thet[3]) + 0.5*sum(log(nv*thet[2]^2 + thet[3]^2)) +
      sum((Yv-thet[1])^2)/(2*thet[3]^2) -
      (0.5*(thet[2]/thet[3])^2)*sum(nv^2*(Xbarj-thet[1])^2/(thet[3]^2 + nv*thet[2]^2)) }

## First evaluate the observed-data logLik at the true parameter.
as.vector(-NLogLikANOVA(thet0, yvec, nvec))     ### = 181.9727  (without the 2*pi's)

## Next find the MLE and hessian by direct numerical maximization.
MLanova = nlm(NLogLikANOVA, c(3,1,1), hessian=T, Yv=yvec, nv=nvec)
                          ### initial neg loglik (at c(3,1,1)) = 116.96
> MLanova
$minimum
[1] -182.6577
$estimate
[1] 4.1403027 0.6824724 0.4045956     ### this is the MLE
$gradient
[1] 9.610504e-06 3.842615e-05 7.435119e-05
$hessian
            [,1]         [,2]         [,3]
[1,] 63.19978860  -0.08051131 7.178222e-02
[2,] -0.08051131 123.26335082 5.141283e+00
[3,]  0.07178222   5.14128260 6.953698e+03
$code
[1] 1
$iterations
[1] 14

### NOTE that mu and sige are well estimated, while siga is estimated less precisely.
> round(sqrt(diag(solve(MLanova$hess))), 3)     ### standard errors
[1] 0.126 0.090 0.012

### Next the EM, which requires no log-likelihood evaluations:

EMstep = function(th.in, Yv, nv) {
   ### One EM update of th.in = (mu, siga, sige); uses the group means Xbarj
   ### from the global environment.
   N = length(Yv)
   gam.in = th.in[2]^2/(th.in[2]^2 + th.in[3]^2/nv)     ## shrinkage weights
   aux = gam.in*(Xbarj - th.in[1])                      ## E[a_j | Y] at current theta
   mu.out = sum(nv*(Xbarj - aux))/N
   c(mu.out,
     sqrt(mean(aux^2 + th.in[3]^2*(gam.in/nv))),
     sqrt(th.in[3]^2*sum(gam.in)/N + mean((Yv - mu.out - rep(aux,nv))^2)) ) }

th = c(3,1,1)
for(i in 1:600) {
   th = EMstep(th, yvec, nvec)
   llik = -NLogLikANOVA(th, yvec, nvec)
   if(i %% 20 == 0) cat("iter ",i,"; theta=",round(th,4),"; logL=",llik,"\n") }
iter 20 ; theta= 3.1587 1.201 0.4045 ; logL= 166.044
iter 40 ; theta= 3.2713 1.1103 0.4045 ; logL= 168.3888
iter 60 ; theta= 3.3872 1.0215 0.4045 ; logL= 170.8746
iter 80 ; theta= 3.5046 0.9374 0.4045 ; logL= 173.4253
iter 100 ; theta= 3.6203 0.8621 0.4045 ; logL= 175.8986
iter 120 ; theta= 3.7295 0.7998 0.4045 ; logL= 178.0955
iter 140 ; theta= 3.8266 0.7533 0.4046 ; logL= 179.8316
iter 160 ; theta= 3.9075 0.7224 0.4046 ; logL= 181.0365
iter 180 ; theta= 3.9712 0.7039 0.4046 ; logL= 181.7797
iter 200 ; theta= 4.019 0.6936 0.4046 ; logL= 182.1996
iter 220 ; theta= 4.0539 0.6881 0.4046 ; logL= 182.4238
iter 240 ; theta= 4.079 0.6853 0.4046 ; logL= 182.5396
iter 260 ; theta= 4.097 0.6839 0.4046 ; logL= 182.5985
iter 280 ; theta= 4.1097 0.6832 0.4046 ; logL= 182.6281
iter 300 ; theta= 4.1187 0.6828 0.4046 ; logL= 182.6429
iter 320 ; theta= 4.125 0.6826 0.4046 ; logL= 182.6503
iter 340 ; theta= 4.1295 0.6826 0.4046 ; logL= 182.654
iter 360 ; theta= 4.1327 0.6825 0.4046 ; logL= 182.6559
iter 380 ; theta= 4.1349 0.6825 0.4046 ; logL= 182.6568
iter 400 ; theta= 4.1365 0.6825 0.4046 ; logL= 182.6573
iter 420 ; theta= 4.1376 0.6825 0.4046 ; logL= 182.6575
iter 440 ; theta= 4.1384 0.6825 0.4046 ; logL= 182.6576
iter 460 ; theta= 4.139 0.6825 0.4046 ; logL= 182.6577
iter 480 ; theta= 4.1394 0.6825 0.4046 ; logL= 182.6577
iter 500 ; theta= 4.1396 0.6825 0.4046 ; logL= 182.6577
iter 520 ; theta= 4.1398 0.6825 0.4046 ; logL= 182.6577
iter 540 ; theta= 4.14 0.6825 0.4046 ; logL= 182.6577
iter 560 ; theta= 4.1401 0.6825 0.4046 ; logL= 182.6577
iter 580 ; theta= 4.1401 0.6825 0.4046 ; logL= 182.6577
iter 600 ; theta= 4.1402 0.6825 0.4046 ; logL= 182.6577
#### Note the slow convergence !!!

### Now again compare the direct negative Hessian for the observed-data logLik
### versus the Louis formula value:   ... (in class as time permits)
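
## One quick numerical check (a sketch of ours, not the Louis-formula calculation
## deferred to class): evaluate the observed-data Hessian at the EM solution th by
## numerical differentiation, using base R's optimHess(), and compare it and the
## implied standard errors with MLanova$hessian above.
HessEM = optimHess(th, NLogLikANOVA, Yv=yvec, nv=nvec)
round(HessEM, 3)                         ## should be close to MLanova$hessian
round(sqrt(diag(solve(HessEM))), 3)      ## standard errors at the EM estimate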