Homework Solutions to Problem 25. ================================= [NB solution is in Splus 6.0] > hist(geyser$waiting, nclass=20, prob=T, xlab="Waiting Times", ylab="Scaled Relative Frequency", main= "Geyser Waiting Times, Histogram & Overplotted Density Estimates") > > gpts <- seq(min(geyser$waiting), max(geyser$waiting), length=50) kerMatg1 <- outer(gpts, geyser$waiting, function(x,y) dlogis(x,y,.5)) kerMatg2 <- outer(gpts, geyser$waiting, function(x,y) dlogis(x,y,3)) lines(gpts, c(kerMatg1 %*% rep(1/299,299)), lty=1) lines(gpts, c(kerMatg2 %*% rep(1/299,299)), lty=4) legend(locator(), legend=c("Bandwidth .5", "Bandwidth 3"), lty=c(1,4)) > lLk2 <- function(x, pwt, mn, scal, dens = dnorm) { dnsmat <- matrix(dens(outer(mn,x,"-")/scal)/scal, ncol=length(x)) sum(log(pwt %*% dnsmat)) } > tmpft <- nlmin(function(w) -lLk2(log(geyser$waiting), c(plogis(w[1]), 1-plogis(w[1])), w[2:3], exp(w[4:5])), c(0.5,3.95,4.35,log(.2),log(.2)), print.level=1, max.fcal=300, max.iter=100) > tmpft $x: [1] -0.7073344 4.0009645 4.3892868 -2.3041704 -2.4458907 $converged: [1] T $conv.type: [1] "relative function convergence" ### Thus we have convergence to a mixture of two normals (for the log-waiting times) with proportions 0.3302, .6698 of |N(4.0010, 0.0998^2) and |N(4.3893, 0.0866^2) ### Now let's try a mixture of three normals: > tmpft3 <- nlmin(function(w) -lLk2(log(geyser$waiting), c(plogis(w[1]), plogis(w[2])*(1-plogis(w[1])), (1-plogis(w[1]))* (1-plogis(w[2]))), w[3:5], exp(w[6:8])), c(0.5,0.5,3.95,4.15,4.35, rep(log(.2),3)), print.level=1, max.fcal=300, max.iter=100) > tmpft3 $x: [1] -1.2087436 -0.9080385 3.9655844 4.2238913 4.3998657 -2.5495674 [7] -1.7914989 -2.5778582 $converged: [1] T $conv.type: [1] "relative function convergence" ### This gives normal with probabilty weights > c(plogis(tmpft3$x[1]),(1-plogis(tmpft3$x[1]))*plogis(tmpft3$x[2]), (1-plogis(tmpft3$x[1]))*(1-plogis(tmpft3$x[2]))) [1] 0.2299234 0.2213211 0.5487555 ### and means and scale parameters respectively given by > rbind(means=tmpft3$x[3:5], sdev=exp(tmpft3$x[6:8])) [,1] [,2] [,3] means 3.96558439 4.2238913 4.39986566 sdev 0.07811545 0.1667101 0.07593647 > hist(geyser$waiting, nclass=20, prob=T, xlab="Waiting Times", ylab="Scaled Relative Frequency", main= "Geyser Waiting Times, Histogram & Overplotted Density Estimates") ### We overplot the two and three component log-normals: > fmix function(y, pwt, mn, scal) { prb <- if(length(pwt) == 2) c(plogis(pwt[1]), (1 - plogis(pwt[ 1])) * plogis(pwt[2]), (1 - plogis(pwt[1])) * (1 - plogis( pwt[2]))) else c(plogis(pwt[1]), (1 - plogis(pwt[1]))) c(prb%*% matrix(dnorm(outer(mn,log(y),"-")/scal)/scal, ncol=length(y)))/y } > lines(seq(43,108,.5), fmix(seq(43,108,.5),tmpft$x[1], tmpft$x[2:3], exp(tmpft$x[4:5])), lty=1) > lines(seq(43,108,.5), fmix(seq(43,108,.5),tmpft3$x[1:2], tmpft3$x[3:5], exp(tmpft3$x[6:8])), lty=3) ### To compare the fit done both ways: ### logLik for 2-component and for 3-component models: > sum(log(fmix(geyser$waiting,tmpft$x[1], tmpft$x[2:3], exp(tmpft$x[4:5])))) [1] -1155.923 > sum(log(fmix(geyser$waiting,tmpft3$x[1:2], tmpft3$x[3:5], exp(tmpft3$x[6:8])))) [1] -1154.563 ### So the fit is only a tiny bit `better' with the third component, ### definitely not enough of an improvement over 2-component ### to be worth bothering with !