R Scripts for EM Algorithm Examples on Handout NeEMhandoutsS19.pdf
===================================================================
E. Slud      2/18/2019

Example I. Contingency Table

NegLlkCom = function(theta, Xmat) {
    ### Negative complete-data log-likelihood, where Xmat is the 2x3 table,
    ### as a function of theta = (p1, p2, alpha);
    ### p3 and gamma are filled in using the constraints.
    p1 = theta[1]; p2 = theta[2]; p3 = 1-p1-p2
    alpha = theta[3]; gamma = (1-alpha*p1)/(2-p1)
    Xcol = Xmat[1,] + Xmat[2,]
    -( Xmat[1,1]*log(alpha/gamma) + sum(Xcol*log(c(p1,p2,p3))) +
       sum(c(Xmat))*log(gamma) ) }

NegLlkObs1 = function(theta, Xvec) {
    ### Negative observed-data log-likelihood as a function of theta = (p1, p2, alpha),
    ### where p3 and gamma are filled in using the constraints, and
    ### Xvec is the 5-vector of X11, X12, X13, X21, X22+X23.
    p1 = theta[1]; p2 = theta[2]; p3 = 1-p1-p2
    alpha = theta[3]; gamma = (1-alpha*p1)/(2-p1)
    -( Xvec[1]*log(alpha/gamma) + sum(Xvec)*log(gamma) +
       sum(c(Xvec[1]+Xvec[4], Xvec[2], Xvec[3])*log(c(p1,p2,p3))) +
       Xvec[5]*log(1-p1) ) }

MLEcom = function(Xmat) {
    ### Closed-form complete-data MLE of (p1, p2, alpha) from the full 2x3 table Xmat
    N = sum(c(Xmat))
    aux = N - Xmat[1,1] + Xmat[2,1]
    p1 = 2*Xmat[2,1]/aux
    c(p1 = p1, p2 = (Xmat[1,2]+Xmat[2,2])/aux, alpha = Xmat[1,1]/(N*p1)) }

XEMstep = function(Xvec, pvec) {
    ### One EM update of p2 for the "obs1" data, in which X22 and X23 are not
    ### observed separately; pvec = (p1, p2, p3) with p1 held fixed.
    N = sum(c(Xvec))
    aux = N - Xvec[1] + Xvec[4]
    pvec[2] = (Xvec[2] + pvec[2]*Xvec[5]/(1-pvec[1]))/aux
    pvec[2] }

### Now input the table given on p.3 of the handout for this "obs1" data case.

Xvec0 = c(30,25,45,50,58)
Xmat0 = rbind(Xvec0[1:3], c(Xvec0[4], 0.5*Xvec0[5]*c(1,1)))

## Initialize at
> pvec0 = c(50/108, (58/108)*c(25,45)/70)
> pvec0
[1] 0.4629630 0.1917989 0.3452381
> gam0 = 108/208
> alp0 = (1-(2-pvec0[1])*gam0)/pvec0[1]
> alp0
[1] 0.4361538
> rbind(c(alp0*pvec0[1], gam0*pvec0[2:3]), gam0*pvec0)*208
     [,1]     [,2]     [,3]
[1,]   42 20.71429 37.28571
[2,]   50 20.71429 37.28571
### initial guess, logLik = -366.77

> MLobs1 = nlm(NegLlkObs1, c(pvec0[1:2],alp0), hessian=T, Xvec=Xvec0)
$minimum
[1] 325.8568          ### maximized logLik = -325.86
$estimate
[1] 0.4385959 0.2005010 0.3288463
$gradient
[1] 3.416289e-05 4.877165e-05 2.125944e-05
$code
[1] 1
$iterations
[1] 8

### Fitted cell probabilities
> pfit1 = c(MLobs1$est[1:2], 1-sum(MLobs1$est[1:2]))
  gfit1 = (1-MLobs1$est[3]*pfit1[1])/(2-pfit1[1])
  Xfit1 = matrix(c(MLobs1$est[3]*pfit1[1], gfit1*c(pfit1[-1],pfit1)),
                 byrow=T, nrow=2)*208
  Xfit1
         [,1]    [,2]     [,3]
[1,] 29.99996 22.8571 41.14293
[2,] 49.99997 22.8571 41.14293      ### final fit, OK

### Now let's check the final answer against the closed-form formulas.
c(pfit1[1], 2*Xvec0[4]/(sum(Xvec0)-Xvec0[1]+Xvec0[4]))
[1] 0.4385962 0.4385965          ### OK
c(pfit1[2], Xvec0[2]*(sum(Xvec0)-Xvec0[1]-Xvec0[4])/
    ((sum(Xvec0)-Xvec0[1]+Xvec0[4])*sum(Xvec0[2:3])))
[1] 0.2005009 0.2005013          ### OK

### Now let's run the EM iteration, starting from Xvec0.
pvec = c(pfit1[1], 0.5*c(1,1)*(1-pfit1[1]))
for(j in 1:15) {
   pvec[2] = XEMstep(Xvec0, pvec)
   cat("iter ",j,"; p2=",round(pvec[2],6),"\n") }
iter 1 ; p2= 0.236842
iter 2 ; p2= 0.216968
iter 3 ; p2= 0.207963
iter 4 ; p2= 0.203882
iter 5 ; p2= 0.202033
iter 6 ; p2= 0.201195
iter 7 ; p2= 0.200816
iter 8 ; p2= 0.200644
iter 9 ; p2= 0.200566
iter 10 ; p2= 0.20053
iter 11 ; p2= 0.200514
iter 12 ; p2= 0.200507
iter 13 ; p2= 0.200504
iter 14 ; p2= 0.200502
iter 15 ; p2= 0.200502
### This confirms convergence, after not too many iterations, to the
### same p2 value found (in pfit1[2]) by maximizing L_{obs,1} directly.
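
## As a reading aid (our annotation, not part of the original handout script), the
## single update carried out by XEMstep can be written as an explicit E-step followed
## by the complete-data M-step formula for p2 from MLEcom:
##
##    E-step:  Xhat_{22}^{(t)} = E[ X_{22} | X_{22}+X_{23}, theta^{(t)} ]
##                             = (X_{22}+X_{23}) * p_2^{(t)} / (1 - p_1^{(t)})
##    M-step:  p_2^{(t+1)} = ( X_{12} + Xhat_{22}^{(t)} ) / ( N - X_{11} + X_{21} )
##
## Only p2 needs to be iterated in the "obs1" case: the complete-data MLEs of p1 and
## alpha, namely 2*X_{21}/(N-X_{11}+X_{21}) and X_{11}/(N*p1), involve only counts
## observed here, which is why the loop above holds pvec[1] fixed at pfit1[1].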
#------------------
## Continue by defining log L_{obs,2} for the second case of observed data,
## with X_{21}+X_{22} observed in place of X_{21} and X_{22} separately.

NegLlkObs2 = function(theta, Xvec) {
    ### Negative observed-data log-likelihood as a function of theta = (p1, p2, alpha),
    ### where p3 and gamma are filled in using the constraints, and
    ### Xvec is the 5-vector of X11, X12, X13, X21+X22, X23.
    p1 = theta[1]; p2 = theta[2]; p3 = 1-p1-p2
    alpha = theta[3]; gamma = (1-alpha*p1)/(2-p1)
    -( Xvec[1]*log(alpha/gamma) + sum(Xvec)*log(gamma) +
       sum(c(Xvec[1:2], Xvec[3]+Xvec[5])*log(c(p1,p2,p3))) +
       Xvec[4]*log(1-p3) ) }

Xvec2 = c(30,25,45,71,37)
MLE2 = MLEcom(rbind(c(30,25,45), c(50,21,37)))   ## complete-data MLE, for comparison
round(MLE2, 4)
    p1     p2  alpha
0.4386 0.2018 0.3288

## Start by numerically maximizing L_{obs,2} directly, from the L_{obs,1} MLE.
> MLobs2 = nlm(NegLlkObs2, MLobs1$est, hessian=T, Xvec=Xvec2)
$minimum
[1] 320.5369          #### maximized logLik_{obs,2} = -320.54
$estimate
[1] 0.4107145 0.2232137 0.3511699
$gradient
[1] 1.506351e-04 6.343726e-05 1.401190e-04
$code
[1] 1
$iterations
[1] 9

### Next do it via EM: start with p1=.2, p2=.2, alpha=.5
thet = c(.2,.2,.5)
Xmat = rbind(Xvec2[1:3], c(c(.5,.5)*Xvec2[4], Xvec2[5]))
-NegLlkObs2(thet, Xvec2)          ### initial logL_{obs,2} = -339.638

> for(j in 1:20) {
     Xmat[2,1:2] = Xvec2[4]*thet[1:2]/sum(thet[1:2])   ## E-step: split X21+X22 as p1:p2
     thet = MLEcom(Xmat)                               ## M-step: complete-data MLE
     cat("iter ",j,"; pfit=",round(c(thet[1:2],1-sum(thet[1:2])),4),
         "; logL_{obs2}=",-NegLlkObs2(thet,Xvec2),"\n") }
iter 1 ; pfit= 0.3326 0.2834 0.3841 ; logL_{obs2}= -321.2256
iter 2 ; pfit= 0.3544 0.2666 0.379 ; logL_{obs2}= -320.9111
iter 3 ; pfit= 0.3709 0.2539 0.3752 ; logL_{obs2}= -320.7311
iter 4 ; pfit= 0.3829 0.2446 0.3725 ; logL_{obs2}= -320.6341
iter 5 ; pfit= 0.3915 0.238 0.3705 ; logL_{obs2}= -320.5843
iter 6 ; pfit= 0.3975 0.2334 0.3691 ; logL_{obs2}= -320.5596
iter 7 ; pfit= 0.4017 0.2302 0.3681 ; logL_{obs2}= -320.5476
iter 8 ; pfit= 0.4046 0.2279 0.3675 ; logL_{obs2}= -320.5419
iter 9 ; pfit= 0.4065 0.2264 0.367 ; logL_{obs2}= -320.5392
iter 10 ; pfit= 0.4079 0.2254 0.3667 ; logL_{obs2}= -320.538
iter 11 ; pfit= 0.4088 0.2247 0.3665 ; logL_{obs2}= -320.5374
iter 12 ; pfit= 0.4094 0.2242 0.3664 ; logL_{obs2}= -320.5371
iter 13 ; pfit= 0.4098 0.2239 0.3663 ; logL_{obs2}= -320.537
iter 14 ; pfit= 0.4101 0.2237 0.3662 ; logL_{obs2}= -320.537
iter 15 ; pfit= 0.4103 0.2235 0.3662 ; logL_{obs2}= -320.5369
iter 16 ; pfit= 0.4104 0.2234 0.3661 ; logL_{obs2}= -320.5369
iter 17 ; pfit= 0.4105 0.2234 0.3661 ; logL_{obs2}= -320.5369
iter 18 ; pfit= 0.4106 0.2233 0.3661 ; logL_{obs2}= -320.5369
iter 19 ; pfit= 0.4106 0.2233 0.3661 ; logL_{obs2}= -320.5369
iter 20 ; pfit= 0.4107 0.2233 0.3661 ; logL_{obs2}= -320.5369
### Even after these 20 iterations we have not quite converged, but almost;
### a tolerance-based wrapper for the same iteration is sketched below.

##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
## Up to this point we have direct and EM estimates of the observed-data MLEs for two
## different versions of the observed data from the 2x3 "complete data" contingency
## table.  One of those was for the data in problem part I(c) of HW2, i.e. for the
## data with likelihood L_{obs,1}, where X_{22}+X_{23} is observed but not X_{22}
## and X_{23} separately.
##
## The other version of the data, with likelihood L_{obs,2}, corresponding to
## X_{21}+X_{22} being observed but not X_{21} and X_{22} separately, was also treated
## in the script above.  In particular, the estimates of (p1,p2,alpha) were given in
## MLobs2$est by direct numerical maximization of log L_{obs,2}, and in the variable
## "thet" above via EM.
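
## Since the 20 EM iterations above stop a bit short of convergence, here is a minimal
## wrapper (a sketch of ours, not part of the original script) that repeats the same
## E-step and M-step until successive parameter changes fall below a tolerance.  It
## assumes the MLEcom and NegLlkObs2 functions defined above; the names EMobs2, tol,
## and maxit are ours.

EMobs2 = function(Xvec, thet = c(.2,.2,.5), tol = 1e-8, maxit = 500) {
    ## Xvec = (X11, X12, X13, X21+X22, X23), as in NegLlkObs2
    Xmat = rbind(Xvec[1:3], c(0.5*Xvec[4]*c(1,1), Xvec[5]))
    for(j in 1:maxit) {
       Xmat[2,1:2] = Xvec[4]*thet[1:2]/sum(thet[1:2])   ## E-step: split X21+X22 as p1:p2
       thet.new = MLEcom(Xmat)                          ## M-step: complete-data MLE
       if(max(abs(thet.new - thet)) < tol) { thet = thet.new; break }
       thet = thet.new }
    list(estimate = thet, iterations = j, logLobs2 = -NegLlkObs2(thet, Xvec)) }

## e.g.  EMobs2(Xvec2)$estimate  should essentially reproduce MLobs2$estimate.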
## To complete our script, we compare the observed-data Fisher information (for the
## whole sample, not per-observation) I_{obs,2}, which we computed (but did not yet
## display) in MLobs2$hessian.  Recall that the order of the parameter components in
## the R script was (p1,p2,alpha), so the components must be reordered to give:

> round( MLobs2$hessian[c(3,1:2), c(3,1:2)], 2)
       [,1]   [,2]    [,3]
[1,] 284.13 243.08    0.00
[2,] 243.08 926.11  788.86
[3,]   0.00 788.86 1290.17

## The EM-algorithm version of the same information matrix is obtained through
## Louis' (1982) formula, formula (2.2) of Theorem 2.7 in the Kim-Shao book.
## (YOU ARE ASKED IN HW2 PROBLEM I(c) TO DO AN ANALOGOUS CALCULATION FOR I_{obs,1}
## BASED ON L_{obs,1}.)
## The formula (2.22) (expanded to an estimate of information based on the whole
## observed data-sample) requires calculation of three matrices, shown carefully in
## the handout EMinfo-examp.pdf linked on the web-page under the EM handouts (1).
## The numerical computations and comparisons follow.

## Recall Xvec2 = c(30,25,45,71,37) holds respectively the values of X11, X12, X13,
## X21+X22, X23, and thet[c(3,1:2)] is the R variable containing (alpha,p1,p2).

> alpha = thet[3]; p1 = thet[1]; p2 = thet[2]
> thet[c(3,1:2)]
    alpha        p1        p2
0.3512191 0.4106575 0.2232580

## NB as a rough check on the model, gamma should be estimated consistently both by
## the observed second-row fraction and by the constraint formula at the fitted values:
> as.vector(c(sum(Xvec2[4:5]/208), (1-alpha*p1)/(2-p1)))
[1] 0.5192308 0.5384423          #### Not perfect, but OK

> Xp3 = Xvec2[3] + Xvec2[5]
> ObsInfo2 = array( c(
     Xvec2[1]/alpha^2 + (208-Xvec2[1])*p1^2/(1-alpha*p1)^2,
     (208-Xvec2[1])/(1-alpha*p1)^2,
     0,
     (208-Xvec2[1])/(1-alpha*p1)^2,
     Xvec2[1]/p1^2 + Xvec2[4]/(p1*(p1+p2)) + Xp3/(1-p1-p2)^2 +
        (208-Xvec2[1])*(alpha^2/(1-alpha*p1)^2 - 1/(2-p1)^2) -
        Xvec2[4]*p2/(p1*(p1+p2)^2),
     Xp3/(1-p1-p2)^2 + Xvec2[4]/(p1+p2)^2,
     0,
     Xp3/(1-p1-p2)^2 + Xvec2[4]/(p1+p2)^2,
     Xvec2[2]/p2^2 + Xvec2[4]/(p2*(p1+p2)) + Xp3/(1-p1-p2)^2 -
        Xvec2[4]*p1/(p2*(p1+p2)^2)),
     c(3,3))
> ObsInfo2
         [,1]     [,2]      [,3]
[1,] 284.1896 243.0562    0.0000
[2,] 243.0562 925.9515  788.5421
[3,]   0.0000 788.5421 1290.1056

## Compare this once again with
> round(MLobs2$hess[c(3,1:2), c(3,1:2)], 2)
       [,1]   [,2]    [,3]
[1,] 284.13 243.08    0.00
[2,] 243.08 926.11  788.86
[3,]   0.00 788.86 1290.17
### OK: THIS MATRIX IS NOW VERY CLOSE TO THE ONE OBTAINED BY DIRECT logL_{obs}
### MAXIMIZATION, as it must be, because the Louis formula (2.23) is an algebraically
### identical re-expression of the observed-data observed information !!!
### The only discrepancy is due to the difference between the EM-estimated (and not
### fully converged) estimator thet and the direct numerical maximizer MLobs2$est.


Example 2. Unbalanced ANOVA        2/25/2019
---------------------------

## First simulate a moderately large unbalanced-ANOVA dataset with m=30 groups.
nvec = sample(10:30, 30, replace=T)      ### sum(nvec) = N = 600 in this realization
thet0 = c(mu=4, siga=0.7, sige=0.4)
avec = rnorm(30, 0, thet0[2])
yvec = thet0[1] + rep(avec, nvec) + rnorm(600, 0, thet0[3])
> Xbarj = aggregate(yvec, by=list(rep(1:30,nvec)), mean)$x     ## the m group means

## NB in this example, for each j the within-group vector (X_{ij})_i is MVN with
## covariance matrix the sum of a rank-1 matrix and a diagonal matrix.
## The log-likelihood is slightly messy.
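## Written out (up to an additive constant involving 2*pi), the negative marginal
## log-likelihood that the next function implements has the standard random-effects
## form, with Ybar_j denoting the group means stored above in Xbarj:
##
##   -logL_obs(mu, sig_a, sig_e) =
##        (N-m)*log(sig_e) + (1/2) sum_j log(n_j*sig_a^2 + sig_e^2)
##        + sum_{i,j} (Y_{ij} - mu)^2 / (2*sig_e^2)
##        - (sig_a^2/(2*sig_e^2)) sum_j n_j^2 (Ybar_j - mu)^2 / (sig_e^2 + n_j*sig_a^2)
##
## This follows because the group-j covariance matrix sig_e^2*I + sig_a^2*J has
## determinant (sig_e^2)^(n_j-1) * (sig_e^2 + n_j*sig_a^2) and an inverse of the
## Sherman-Morrison form.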
NLogLikANOVA = function(thet, Yv, nv) {
   ### Negative observed-data log-likelihood for the unbalanced one-way random-effects
   ### model, thet = (mu, siga, sige); uses the group means Xbarj from the global
   ### environment, and omits the additive constants involving 2*pi.
   m = length(nv)
   N = length(Yv)
   (N-m)*log(thet[3]) + 0.5*sum(log(nv*thet[2]^2 + thet[3]^2)) +
      sum((Yv-thet[1])^2)/(2*thet[3]^2) -
      (0.5*(thet[2]/thet[3])^2)*sum(nv^2*(Xbarj-thet[1])^2/(thet[3]^2 + nv*thet[2]^2)) }

## First evaluate the observed-data logLik at the true parameter.
as.vector(-NLogLikANOVA(thet0, yvec, nvec))     ### = 181.9727  (without the 2*pi's)

## Next find the MLE and hessian by direct numerical maximization.
MLanova = nlm(NLogLikANOVA, c(3,1,1), hessian=T, Yv=yvec, nv=nvec)
                          ### initial neg loglik (at c(3,1,1)) = 116.96
> MLanova
$minimum
[1] -182.6577
$estimate
[1] 4.1403027 0.6824724 0.4045956     ### this is the MLE
$gradient
[1] 9.610504e-06 3.842615e-05 7.435119e-05
$hessian
            [,1]         [,2]         [,3]
[1,] 63.19978860  -0.08051131 7.178222e-02
[2,] -0.08051131 123.26335082 5.141283e+00
[3,]  0.07178222   5.14128260 6.953698e+03
$code
[1] 1
$iterations
[1] 14

### NOTE that mu and sige are well estimated, while siga is estimated less precisely.
> round(sqrt(diag(solve(MLanova$hess))), 3)     ### standard errors
[1] 0.126 0.090 0.012

### Next the EM, which requires no log-likelihood evaluations:

EMstep = function(th.in, Yv, nv) {
   ### One EM update of th.in = (mu, siga, sige); uses the group means Xbarj
   ### from the global environment.
   N = length(Yv)
   gam.in = th.in[2]^2/(th.in[2]^2 + th.in[3]^2/nv)     ## shrinkage weights
   aux = gam.in*(Xbarj - th.in[1])                      ## E[a_j | Y] at current theta
   mu.out = sum(nv*(Xbarj - aux))/N
   c(mu.out,
     sqrt(mean(aux^2 + th.in[3]^2*(gam.in/nv))),
     sqrt(th.in[3]^2*sum(gam.in)/N + mean((Yv - mu.out - rep(aux,nv))^2)) ) }

th = c(3,1,1)
for(i in 1:600) {
   th = EMstep(th, yvec, nvec)
   llik = -NLogLikANOVA(th, yvec, nvec)
   if(i %% 20 == 0) cat("iter ",i,"; theta=",round(th,4),"; logL=",llik,"\n") }
iter 20 ; theta= 3.1587 1.201 0.4045 ; logL= 166.044
iter 40 ; theta= 3.2713 1.1103 0.4045 ; logL= 168.3888
iter 60 ; theta= 3.3872 1.0215 0.4045 ; logL= 170.8746
iter 80 ; theta= 3.5046 0.9374 0.4045 ; logL= 173.4253
iter 100 ; theta= 3.6203 0.8621 0.4045 ; logL= 175.8986
iter 120 ; theta= 3.7295 0.7998 0.4045 ; logL= 178.0955
iter 140 ; theta= 3.8266 0.7533 0.4046 ; logL= 179.8316
iter 160 ; theta= 3.9075 0.7224 0.4046 ; logL= 181.0365
iter 180 ; theta= 3.9712 0.7039 0.4046 ; logL= 181.7797
iter 200 ; theta= 4.019 0.6936 0.4046 ; logL= 182.1996
iter 220 ; theta= 4.0539 0.6881 0.4046 ; logL= 182.4238
iter 240 ; theta= 4.079 0.6853 0.4046 ; logL= 182.5396
iter 260 ; theta= 4.097 0.6839 0.4046 ; logL= 182.5985
iter 280 ; theta= 4.1097 0.6832 0.4046 ; logL= 182.6281
iter 300 ; theta= 4.1187 0.6828 0.4046 ; logL= 182.6429
iter 320 ; theta= 4.125 0.6826 0.4046 ; logL= 182.6503
iter 340 ; theta= 4.1295 0.6826 0.4046 ; logL= 182.654
iter 360 ; theta= 4.1327 0.6825 0.4046 ; logL= 182.6559
iter 380 ; theta= 4.1349 0.6825 0.4046 ; logL= 182.6568
iter 400 ; theta= 4.1365 0.6825 0.4046 ; logL= 182.6573
iter 420 ; theta= 4.1376 0.6825 0.4046 ; logL= 182.6575
iter 440 ; theta= 4.1384 0.6825 0.4046 ; logL= 182.6576
iter 460 ; theta= 4.139 0.6825 0.4046 ; logL= 182.6577
iter 480 ; theta= 4.1394 0.6825 0.4046 ; logL= 182.6577
iter 500 ; theta= 4.1396 0.6825 0.4046 ; logL= 182.6577
iter 520 ; theta= 4.1398 0.6825 0.4046 ; logL= 182.6577
iter 540 ; theta= 4.14 0.6825 0.4046 ; logL= 182.6577
iter 560 ; theta= 4.1401 0.6825 0.4046 ; logL= 182.6577
iter 580 ; theta= 4.1401 0.6825 0.4046 ; logL= 182.6577
iter 600 ; theta= 4.1402 0.6825 0.4046 ; logL= 182.6577
#### Note the slow convergence !!!

### Now again compare the direct negative Hessian for the observed-data logLik
### versus the Louis formula value:   ... (in class as time permits)
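
## One quick numerical check (a sketch of ours, not the Louis-formula calculation
## deferred to class): evaluate the observed-data Hessian at the EM solution th by
## numerical differentiation, using base R's optimHess(), and compare it and the
## implied standard errors with MLanova$hessian above.
HessEM = optimHess(th, NLogLikANOVA, Yv=yvec, nv=nvec)
round(HessEM, 3)                         ## should be close to MLanova$hessian
round(sqrt(diag(solve(HessEM))), 3)      ## standard errors at the EM estimate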