R Log Related to Density Estimation =================================== 4/30/08 > library(MASS) > galaxies = galaxies/1000 ### 82 obs of galaxy `speeds' > summary(galaxies) Min. 1st Qu. Median Mean 3rd Qu. Max. 9.172 19.53 20.83 20.83 23.13 34.28 > hist(galaxies, nclass=16, prob=T, xlim=c(7,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= "Gaussian Window, Default") > dens1 = density(galaxies, n=50) > names(dens1) [1] "x" "y" "bw" "n" "call" "data.name" "has.na" > lines(dens1$x,dens1$y) > dens2 = density(galaxies,window="r") hist(galaxies, nclass=16, prob=T, xlim=c(7,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= "Rectang Window, Default") lines(dens2$x,dens2$y) > pts = dens1$x kerMat = outer(pts, galaxies, function(x,y) dlogis(x,y,.2)) > dim(kerMat) [1] 50 82 > dens3 = cbind(pts, c(kerMat %*% rep(1/82,82))) > hist(galaxies, nclass=16, prob=T, xlim=c(7,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= "Logistic Kernel, b=.2") lines(pts,dens3[,2]) > kerMatB = outer(pts, galaxies, function(x,y) dlogis(x,y,.5)) > hist(galaxies, nclass=16, prob=T, xlim=c(7,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= "Logistic Kernel, b=.5") lines(pts,c(kerMatB %*% rep(1/82,82))) ### We can see from this picture, saved as DensGalx.ps, that ### the choice of bandwidth has MORE effect than the choice of ### window in determining how the density estimate looks. ### Next consider a spline-based estimate, using the same data. ### We use all 82 points as "knots" for the smoothing-spline. > rnkx = numeric(50) for (i in 1:50) rnkx[i] = sum(galaxies<= pts[i])/82 splgalx = smooth.spline(pts, rnkx, spar=1.e-6) plot(pts,rnkx) lines(pts, predict(splgalx,pts)$y) > par(mfrow=c(2,2)) spars = c(1.e-6,1.e-5,5.e-5, 1.e-4) for(i in 1:4) { hist(galaxies, nclass=16, prob=T, xlim=c(7,40), ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", main= paste("Spline Fit, spar=",spars[i],sep="")) splgalx <- smooth.spline(pts, rnkx, spar=spars[i]) lines(pts, predict(splgalx,pts,1)$y) } > printgraph(file="SplinDns.ps") ----------------------------------------------------------- ### Now let's compare with mixture of logistics: > lLk = function(x, pwt, mn, scal, dens = dlogis) { invsc = 1/scal dnsmat = invsc*matrix(dens(invsc* outer(mn,x,"-")), ncol=length(x)) sum(log(pwt %*% dnsmat)) } > lLk(galaxies, c(.5,.5),c(20,30),c(2,2)) ## -268.8732 > lLk(galaxies,1,21.074,2.201) ## = -233.5441 > lLk(galaxies,c(0.01,.99,0),c(9.7,21.074,34),c(1,2.201,1.5)) [1] -229.6879 > lLk(galaxies,c(0.01,.99,0),c(9.7,21.074,34),c(0.5,2.201,1.5)) [1] -227.4462 > tmpft <- nlm(function(w) -lLk(galaxies,c(plogis(w[1]), 1-plogis(w[1])), w[2:3], exp(w[4:5])), c(0,20,30,2,2), print.level=1, iterlim=100) ... iteration = 23 Parameter: [1] 1.1663503 21.2954749 18.7628547 0.1574718 1.6160380 Function Value [1] 222.5247 Gradient: [1] 8.823679e-07 -2.274227e-06 -1.632106e-07 -5.598793e-06 6.281384e-06 Relative gradient close to zero. Current iterate is probably solution. > tmpft$est [1] 1.1663503 21.2954749 18.7628547 0.1574718 1.6160380 ### So assign mean 21.295 with weight plogis(1.1663) = 0.762 ### Now try again with three components, small scales > tmpft = nlm(function(w) -lLk(galaxies,c(plogis(w[1]), (1-plogis(w[1]))*plogis(w[2]),(1-plogis(w[1]))* (1-plogis(w[2]))), w[3:5], exp(w[6:8])), c(0,0,20,25,30,.2,.3,.3), print.level=1, iterlim=100) ... iteration = 75 Parameter: [1] -2.3741589 3.1874677 9.6708678 21.3220838 32.9733702 -1.3747431 0.2409259 [8] -0.5664953 Function Value [1] 204.58 Gradient: [1] -3.112532e-07 -7.133364e-07 7.464805e-06 4.318825e-07 1.746330e-06 [6] 2.480904e-07 -1.818989e-06 0.000000e+00 Relative gradient close to zero. Current iterate is probably solution. > tmpft$est [1] -2.3741589 3.1874677 9.6708678 21.3220838 32.9733702 [6] -1.3747431 0.2409259 -0.5664953 ### Now weights are: .085, 0.8785, .0365, means 9.67, 21.322, 32.97 > hist(galaxies, nclass=24, prob=T, xlim=c(7,40), ylim=c(0,.22), xlab="Galaxy speed", ylab="Rel Freq", main= "Best fit to Galaxies Density with up to 3 Symm Comp's") lines(pts, .085*dnorm(pts,9.671, exp(-1.3747))+.8785* dnorm(pts,21.322,exp(.24093))+.0365*dnorm(pts,32.97, exp(-.5665)), lty=3) #### Much better fit !!! > printgraph(file="GalxFt.ps") ### So far, we have obtained: logLik = -222.52 with 2 components (5 free parameters) = -204.60 with 3 components (7 free parameters) ### Finally, we try 4 components: > tmpft = nlm(function(w) -lLk(galaxies,c(plogis(w[1]), (1-plogis(w[1]))*plogis(w[2]), (1-plogis(w[1]))* (1-plogis(w[2]))*plogis(w[3]), (1-plogis(w[1]))* (1-plogis(w[2]))*(1-plogis(w[3]))), w[4:7], exp(w[8:11])), c(-2,2,1,10,20,23,33,.2,.3,.3,.2), print.level=1, iterlim=100) ... iteration = 32 Parameter: [1] -2.3722735 -0.8747908 2.8233345 9.6715988 19.7329128 22.2472937 [7] 32.9745375 -1.3736705 -1.1658544 0.2159503 -0.5665459 Function Value [1] 197.6925 Gradient: [1] 6.253972e-06 -9.094947e-07 1.248273e-06 -2.306862e-06 2.589695e-06 [6] 2.405600e-06 2.212141e-05 -1.303491e-06 -4.656282e-06 5.684342e-08 [11] 4.689582e-06 Successive iterates within tolerance. Current iterate is probably solution. ### logLik = -197.7 > tmpft$x [1] -2.3722718 -0.8747788 2.8233325 9.6716033 19.7329244 22.2473120 [7] 32.9745451 -1.3736665 -1.1658495 0.2159502 -0.5665471 ### `Final' answer: weights .08531, 0.26916, 0.60933, 0.0362 ## means 9.671603 19.732924 22.247312 32.974545 ## scales 0.2531770 0.3116578 1.2410406 0.5674815 > sum(log(.08531*dlogis(galaxies, 9.671603, 0.2531770) + .26916*dlogis( galaxies, 19.732924, 0.3116578) + .60933*dlogis(galaxies, 22.247312, 1.2410406) + 0.0362*dlogis(galaxies,32.974545, 0.5674815))) [1] -197.6925 > hist(galaxies, nclass=24, prob=T, xlim=c(7,40), ylim=c(0,.22), xlab="Galaxy speed", ylab="Rel Freq", main= "Best fit to Galaxies Density with 4 Symm Comp's") lines(pts, .08531*dlogis(pts, 9.671603, 0.2531770) + .26916*dlogis( pts, 19.732924, 0.3116578) + .60933*dlogis(pts, 22.247312, 1.2410406) + 0.0362*dlogis(pts,32.974545, 0.5674815), lty=3) ### Really nice fit !!! We can see this becoming better and better with more components !!! > printgraph(file="GalxFt2.ps")