HW12 calculations/solution
==========================

set.seed(7799)
G = sample(1:2, 500, replace=T, prob=c(.3,.7))
> table(G)
G
  1   2 
161 339 
Xdat = rnorm(500, 10, 1+G)
> c(mean(Xdat[G==1]), sd(Xdat[G==1]))
[1] 9.793693 1.858164
> c(mean(Xdat[G==2]), sd(Xdat[G==2]))
[1] 9.965421 2.821149

### OK

> NlogLikmix = function(thet, Xv)
     -sum(log( thet[1]*dnorm(Xv,thet[2],thet[3]) + (1-thet[1])*dnorm(Xv,thet[2],thet[4]) ))

> fit0 = optim(c(.4, 12, 2, 4), NlogLikmix, Xv=Xdat, method="L-BFGS-B",
               lower=c(0,-Inf,0,0), upper=c(1,Inf,Inf,Inf), hessian=T)
$par
[1] 0.4512559 9.9146052 1.8243200 3.0154550

$value
[1] 1174.248

$counts
function gradient 
      24       24 

$convergence
[1] 0

$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

$hessian
            [,1]       [,2]      [,3]      [,4]
[1,] 183.1243673  0.5915709 -64.23231 -81.36597
[2,]   0.5915709 80.8670006   1.07095  -0.36976
[3,] -64.2323088  1.0709500  39.68650  17.89200
[4,] -81.3659742 -0.3697600  17.89200  52.15001

#### OK, this is a reasonably accurate MLE !!!  But the maximizer did not always reach the
#### best (global) MLE with "optim" if the starting values were chosen badly:

> fit1 = optim(c(.5, 15, 3, 5), NlogLikmix, Xv=Xdat, method="L-BFGS-B",
               lower=c(0,-Inf,0,0), upper=c(1,Inf,Inf,Inf), hessian=T)
> fit1$par
[1] 1.000000 9.910126 2.547849 5.944331
> NlogLikmix(fit1$par, Xdat)
[1] 1177.094

> fit2 = optim(c(.5, 15, 3, 5), NlogLikmix, Xv=Xdat, method="L-BFGS-B",
               lower=c(1e-2,-Inf,1e-2,1e-2), upper=c(0.9,Inf,Inf,Inf), hessian=T)
$par
[1] 0.4512405 9.9145842 1.8243270 3.0154503

$value
[1] 1174.248

$counts
function gradient 
      35       35 

$convergence
[1] 0

$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

$hessian
            [,1]       [,2]      [,3]        [,4]
[1,] 183.1163140  0.5901113 -64.22600 -81.3672253
[2,]   0.5901113 80.8659469   1.07149  -0.3691935
[3,] -64.2260030  1.0714897  39.68351  17.8917842
[4,] -81.3672253 -0.3691935  17.89178  52.1505592

### Also re-do the optimization using nlminb:

fit3 = nlminb( c(.5, 15, 3, 5), NlogLikmix, Xv=Xdat,
               lower=c(1e-2,-Inf,1e-2,1e-2), upper=c(.99,Inf,1e2,1e2))
$par
[1] 0.548748 9.914602 3.015436 1.824337

$objective
[1] 1174.248

$convergence
[1] 0

$iterations
[1] 21

$evaluations
function gradient 
      27       99 

$message
[1] "relative convergence (4)"

#### NOTE that this is exactly the same solution but with the two
#### mixture components reversed!  To avoid this, we can re-parameterize
#### (and also do away with the "box" constraints).
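#### As a concrete sketch of that re-parameterization (the helper name "beta2theta" below is
#### only illustrative and is not part of the homework code), the map from an unconstrained
#### vector beta back to (alph, mu, sig1, sig2) that will be used inside nlm() is:

beta2theta = function(beta)      # plogis() undoes qlogis(); exp() undoes log()
    c(plogis(beta[1]), beta[2], exp(beta[3]), exp(beta[3]) + exp(beta[4]))

#### For example, beta2theta(c(0, 15, log(3), log(2))) returns c(0.5, 15, 3, 5), the same
#### starting value used for fit1 and fit3 above, and every beta in R^4 maps to a valid
#### parameter vector with 0 < alph < 1 and 0 < sig1 < sig2, which rules out the
#### component-swapped solution.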
#### We will make a new parameter beta with  beta[1] = qlogis(alph),  beta[2] = mu,
#### beta[3] = log(sig1),  beta[4] = log(sig2-sig1) = log(sig2-exp(beta[3]))

> fit4 = nlm( function(beta,X) NlogLikmix(c(plogis(beta[1]), beta[2], exp(beta[3]),
             exp(beta[3])+exp(beta[4])), X),  c(0, 15, log(3), log(2)), X=Xdat, hessian=T)
> fit4
$minimum
[1] 1174.248

$estimate
[1] -0.1956491  9.9145933  0.6012097  0.1748806

$gradient
[1] -1.705303e-05 -2.965267e-05  9.276846e-05  3.137757e-05

$hessian
            [,1]       [,2]        [,3]        [,4]
[1,]  11.2289172  0.1542397  -65.768245 -23.9999736
[2,]   0.1542397 80.8666378    1.230632  -0.4561099
[3,] -65.7682449  1.2306317  424.749351 152.1803824
[4,] -23.9999736 -0.4561099  152.180382  73.9975803

$code
[1] 1

$iterations
[1] 34

> c(plogis(fit4$est[1]), fit4$est[2], exp(fit4$est[3]), exp(fit4$est[3])+exp(fit4$est[4]))
[1] 0.4512431 9.9145933 1.8243243 3.0154283

### OK, exactly the correct MLE, and this version of the optimization gives the same
### result from different starting points:

rbind(fit4$est, nlm( function(beta,X) NlogLikmix(c(plogis(beta[1]), beta[2], exp(beta[3]),
     exp(beta[3])+exp(beta[4])), X), c(0, 15, log(3), log(2)), X=Xdat, hessian=T)$est)
           [,1]     [,2]      [,3]      [,4]
[1,] -0.1956491 9.914593 0.6012097 0.1748806
[2,] -0.1956491 9.914593 0.6012097 0.1748806

#### 34 iterations

### Now optimize with the EM algorithm, treating the group label Y = G as the missing data.
### Note that  f_{Y|X}(1|x) = a*dnorm(x,mu,sig1)/( a*dnorm(x,mu,sig1) + (1-a)*dnorm(x,mu,sig2) )
### So now we carry out the EM iteration with an M-step that is closed-form apart from one
### univariate minimization; the formulas can be derived by maximizing in the M-step first
### with respect to mu (in terms of sig1 and sig2) and only then with respect to sig1 and sig2.

## --------------- Derivation of computing formulas ---------------

Let  Gi.st = alph0*dnorm(Xi,mu0,sig10)/(alph0*dnorm(Xi,mu0,sig10) + (1-alph0)*dnorm(Xi,mu0,sig20))
     n = length(Xdat)
     Gst = sum(Gi.st)
     S1 = sig1^2 ,  S2 = sig2^2
     th = (alph, mu, S1, S2) ;  th0 = (alph0, mu0, S10, S20)
     mu1hat = sum_i Gi.st*Xi / Gst
     mu2hat = sum_i (1-Gi.st)*Xi / (n-Gst)
     B1 = sum_i Gi.st*(Xi-mu1hat)^2 / Gst
     B2 = sum_i (1-Gi.st)*(Xi-mu2hat)^2 / (n-Gst)

### All of the last four quantities depend only on th0 and Xdat, not on th.

Then the quantity we want to maximize over th = (alph, mu, S1, S2) is the function

   Q(th, th0) = sum_i ( Gi.st*( log(alph) - 0.5*log(S1) - 0.5*(Xi-mu)^2/S1 ) +
                    (1-Gi.st)*( log(1-alph) - 0.5*log(S2) - 0.5*(Xi-mu)^2/S2 ) )

Maximization over alph directly leads to:    alph.hat = Gst/n

Maximization over mu in terms of S1, S2 leads directly to:

   mu.hat = (mu1hat*Gst/S1 + mu2hat*(n-Gst)/S2) / (Gst/S1 + (n-Gst)/S2)             (*)

and some algebra gives

   sum_i Gi.st*(Xi-mu)^2 = sum_i Gi.st*(Xi-mu1hat + mu1hat-mu)^2
                         = sum_i Gi.st*(Xi-mu1hat)^2 + Gst*(mu1hat-mu)^2
                         = Gst*B1 + Gst*(mu1hat-mu)^2

and similarly

   sum_i (1-Gi.st)*(Xi-mu)^2 = sum_i (1-Gi.st)*(Xi-mu2hat + mu2hat-mu)^2
                             = (n-Gst)*B2 + (n-Gst)*(mu2hat-mu)^2

Maximizing Q over S1, S2 in terms of the unknown mu leads immediately to

   S1 = sum_i Gi.st*(Xi-mu)^2 / Gst ,    S2 = sum_i (1-Gi.st)*(Xi-mu)^2 / (n-Gst)    (**)

So the simultaneous solution of (*) and (**) at mu = mu.hat gives the M-step solution.

Alternatively, substituting mu = mu.hat into Q and using the algebra above gives

   -2*( Q((alph.hat, mu.hat, S1, S2), th0) - Gst*log(alph.hat) - (n-Gst)*log(1-alph.hat) )
       = Gst*log(S1) + (n-Gst)*log(S2) + (Gst/S1)*(B1 + (mu1hat-mu.hat)^2)
             + ((n-Gst)/S2)*(B2 + (mu2hat-mu.hat)^2)

This is the function we now want to minimize over S1, S2.
Recalling that mu.hat depends directly on S1 and S2, substituting it into the last expression
leads to

   Gst*log(S1) + (Gst/S1)*B1 + (n-Gst)*log(S2) + (n-Gst)*B2/S2
       + (mu1hat-mu2hat)^2*Gst*(n-Gst)/(Gst*S2+(n-Gst)*S1)

The joint minimizer of this function of (S1, S2) apparently cannot be put into closed form.
Equivalently, we can instead substitute (**) (S1 and S2 as functions of mu) into the objective;
up to additive constants this leaves the univariate profile criterion

   prof(mu) = Gst*log( sum_i Gi.st*(Xi-mu)^2 ) + (n-Gst)*log( sum_i (1-Gi.st)*(Xi-mu)^2 )

which we minimize numerically over mu, after which sig1hat and sig2hat are recovered from (**)
at mu = muhat.  This is how the M-step is implemented below.

###----------------------------------------------------------------------------------

EMiterHW12 = function(th.old, Xv) {
    ## E-step: posterior probability that each observation came from component 1
    Gstvec = th.old[1]*dnorm(Xv,th.old[2],th.old[3])/
        (th.old[1]*dnorm(Xv,th.old[2],th.old[3]) + (1-th.old[1])*dnorm(Xv,th.old[2],th.old[4]))
    Gst = sum(Gstvec)
    n = length(Xv)
    ## M-step: ahat in closed form; muhat by univariate minimization of the profile
    ## criterion prof(mu); sig1hat, sig2hat from (**) evaluated at mu = muhat
    ahat = Gst/n
    prof.mu = function(mu) Gst*log(sum(Gstvec*(Xv-mu)^2)) + (n-Gst)*log(sum((1-Gstvec)*(Xv-mu)^2))
    muhat = optimize(prof.mu, c(min(Xv),max(Xv)))$min
    sig1hat = sqrt(sum(Gstvec*(Xv-muhat)^2)/Gst)
    sig2hat = sqrt(sum((1-Gstvec)*(Xv-muhat)^2)/(n-Gst))
    est = c(ahat, muhat, sig1hat, sig2hat)
    list(est = est, logLik = -NlogLikmix(est, Xv)) }

> th0 = c(.5, 15, 3, 5)
> th0[1] = .45
> th0
[1]  0.45 15.00  3.00  5.00
> NlogLikmix(th0, Xdat)
[1] 1666.892
> EMiterHW12(th0, Xdat)
$est
[1] 0.3378309 9.9787437 2.4028841 2.6200749

$logLik
[1] -1177.092

> EMiterHW12(.Last.value$est, Xdat)
$est
[1] 0.3379799 9.9111869 2.3716008 2.6332832

$logLik
[1] -1176.833

> EMiterHW12(.Last.value$est, Xdat)
$est
[1] 0.3381985 9.9108433 2.3359833 2.6495843

$logLik
[1] -1176.723

> EMiterHW12(.Last.value$est, Xdat)
$est
[1] 0.3385183 9.9110684 2.2949813 2.6680003

$logLik
[1] -1176.577

> EMiterHW12(.Last.value$est, Xdat)
$est
[1] 0.3389761 9.9113353 2.2488269 2.6883178

$logLik
[1] -1176.391

> EMiterHW12(.Last.value$est, Xdat)
$est
[1] 0.3396115 9.9116212 2.1983043 2.7101080

$logLik
[1] -1176.167

> EMiterHW12(.Last.value$est, Xdat)
$est
[1] 0.3404595 9.9119041 2.1448010 2.7327401

$logLik
[1] -1175.912

#### So we see relatively rapid initial improvement from EM, but the convergence slows down
#### considerably if we want many-decimal-place accuracy in the parameters.
#### To run many more iterations in a loop, printing every 15th iterate:

> for (i in 1:300) {
     tmp = EMiterHW12(th0, Xdat)
     th0 = tmp$est
     if( i %% 15 == 0 ) cat("At iter ", i, ", ", round(th0,5), " , logLik=", tmp$logLik, "\n") }
At iter  15 , 0.43686 9.91483 1.80831 2.99846  , logLik= -1174.25 
At iter  30 , 0.43839 9.9148 1.81001 3.00025  , logLik= -1174.249 
At iter  45 , 0.43976 9.91477 1.81154 3.00185  , logLik= -1174.249 
At iter  60 , 0.44098 9.91475 1.8129 3.00329  , logLik= -1174.249 
At iter  75 , 0.44207 9.91473 1.81412 3.00457  , logLik= -1174.249 
At iter  90 , 0.44304 9.91472 1.8152 3.00572  , logLik= -1174.248 
At iter  105 , 0.44391 9.9147 1.81617 3.00675  , logLik= -1174.248 
At iter  120 , 0.44469 9.91469 1.81704 3.00766  , logLik= -1174.248 
At iter  135 , 0.44539 9.91468 1.81781 3.00849  , logLik= -1174.248 
At iter  150 , 0.44601 9.91467 1.81851 3.00922  , logLik= -1174.248 
At iter  165 , 0.44656 9.91466 1.81912 3.00988  , logLik= -1174.248 
At iter  180 , 0.44706 9.91465 1.81968 3.01047  , logLik= -1174.248 
At iter  195 , 0.44751 9.91464 1.82017 3.01099  , logLik= -1174.248 
At iter  210 , 0.4479 9.91463 1.82061 3.01146  , logLik= -1174.248 
At iter  225 , 0.44826 9.91463 1.82101 3.01188  , logLik= -1174.248 
At iter  240 , 0.44857 9.91462 1.82136 3.01226  , logLik= -1174.248 
At iter  255 , 0.44886 9.91462 1.82168 3.0126  , logLik= -1174.248 
At iter  270 , 0.44911 9.91461 1.82196 3.0129  , logLik= -1174.248 
At iter  285 , 0.44934 9.91461 1.82221 3.01317  , logLik= -1174.248 
At iter  300 , 0.44954 9.91461 1.82244 3.01341  , logLik= -1174.248 
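#### A small sketch (not part of the original solution) of running the same EM iteration to a
#### log-likelihood tolerance instead of a fixed 300 steps; the function name "EMrun" and the
#### default tolerance are illustrative choices:

EMrun = function(th, Xv, tol = 1e-8, maxit = 5000) {
    oldLL = -Inf
    for (i in 1:maxit) {
       tmp = EMiterHW12(th, Xv)              # one E-step plus M-step, as defined above
       th  = tmp$est
       if (tmp$logLik - oldLL < tol) break   # stop once the log-likelihood gain is negligible
       oldLL = tmp$logLik }
    list(est = th, logLik = tmp$logLik, iterations = i) }

#### A call such as EMrun(c(.45, 15, 3, 5), Xdat) should approach the same MLE found by
#### optim/nlm above, and the returned iteration count makes the slow final EM convergence
#### noted above explicit.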