R Log Related to Density Estimation
===================================  4/30/08

> library(MASS)
> galaxies = galaxies/1000        ### 82 obs of galaxy `speeds'
> summary(galaxies)
  Min. 1st Qu. Median  Mean 3rd Qu.  Max. 
 9.172   19.53  20.83 20.83   23.13 34.28
> hist(galaxies, nclass=16, prob=T, xlim=c(7,40),
 ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", 
 main= "Gaussian Window, Default")
> dens1 = density(galaxies, n=50)
> names(dens1)
[1] "x" "y"  "bw"  "n"  "call" "data.name"  "has.na"   
> lines(dens1$x,dens1$y)

> dens2 = density(galaxies,window="r")
  hist(galaxies, nclass=16, prob=T, xlim=c(7,40),
 ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", 
 main= "Rectang Window, Default")
  lines(dens2$x,dens2$y)

> pts = dens1$x
  kerMat = outer(pts, galaxies, function(x,y) dlogis(x,y,.2))
> dim(kerMat)
[1] 50 82
> dens3 = cbind(pts, c(kerMat %*% rep(1/82,82)))

> hist(galaxies, nclass=16, prob=T, xlim=c(7,40),
 ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", 
 main= "Logistic Kernel, b=.2")
  lines(pts,dens3[,2])

> kerMatB = outer(pts, galaxies, function(x,y) dlogis(x,y,.5))
> hist(galaxies, nclass=16, prob=T, xlim=c(7,40),
 ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", 
 main= "Logistic Kernel, b=.5")
  lines(pts,c(kerMatB %*% rep(1/82,82)))

###   We can see from this picture, saved as DensGalx.ps, that 
### the choice of bandwidth has MORE effect than the choice of 
### window in determining how the density estimate looks.

###   Next consider a spline-based estimate, using the same data.
### We use all 82 points as "knots" for the smoothing-spline.

> rnkx = numeric(50)
  for (i in 1:50) rnkx[i] = sum(galaxies<= pts[i])/82
  splgalx = smooth.spline(pts, rnkx, spar=1.e-6)
  plot(pts,rnkx)
  lines(pts, predict(splgalx,pts)$y)

> par(mfrow=c(2,2))
 spars = c(1.e-6,1.e-5,5.e-5, 1.e-4)
 for(i in 1:4) {
 hist(galaxies, nclass=16, prob=T, xlim=c(7,40),
   ylim=c(0,.18), xlab="Galaxy speed", ylab="Rel Freq", 
   main= paste("Spline Fit, spar=",spars[i],sep=""))
 splgalx <- smooth.spline(pts, rnkx, spar=spars[i])
 lines(pts, predict(splgalx,pts,1)$y) }
> printgraph(file="SplinDns.ps")

-----------------------------------------------------------

### Now let's compare with mixture of logistics:

> lLk = function(x, pwt, mn, scal, dens = dlogis) {
   invsc = 1/scal
   dnsmat = invsc*matrix(dens(invsc*
        outer(mn,x,"-")), ncol=length(x))
   sum(log(pwt %*% dnsmat)) }

> lLk(galaxies, c(.5,.5),c(20,30),c(2,2))  ## -268.8732

> lLk(galaxies,1,21.074,2.201) ## = -233.5441

> lLk(galaxies,c(0.01,.99,0),c(9.7,21.074,34),c(1,2.201,1.5)) 
[1] -229.6879

> lLk(galaxies,c(0.01,.99,0),c(9.7,21.074,34),c(0.5,2.201,1.5)) 
[1] -227.4462

> tmpft <- nlm(function(w)  -lLk(galaxies,c(plogis(w[1]),
   1-plogis(w[1])), w[2:3], exp(w[4:5])), c(0,20,30,2,2), 
   print.level=1, iterlim=100)
 ...
iteration = 23
Parameter:
[1]  1.1663503 21.2954749 18.7628547  0.1574718  1.6160380
Function Value
[1] 222.5247
Gradient:
[1]  8.823679e-07 -2.274227e-06 -1.632106e-07 -5.598793e-06  6.281384e-06

Relative gradient close to zero.
Current iterate is probably solution.

> tmpft$est
[1]  1.1663503 21.2954749 18.7628547  0.1574718  1.6160380

### So assign mean 21.295 with weight plogis(1.1663) = 0.762

### Now try again with three components, small scales

> tmpft = nlm(function(w)  -lLk(galaxies,c(plogis(w[1]),
   (1-plogis(w[1]))*plogis(w[2]),(1-plogis(w[1]))*
   (1-plogis(w[2]))), w[3:5], exp(w[6:8])), c(0,0,20,25,30,.2,.3,.3), 
   print.level=1, iterlim=100)
 ...
iteration = 75
Parameter:
[1] -2.3741589  3.1874677  9.6708678 21.3220838 32.9733702 -1.3747431  0.2409259
[8] -0.5664953
Function Value
[1] 204.58
Gradient:
[1] -3.112532e-07 -7.133364e-07  7.464805e-06  4.318825e-07  1.746330e-06
[6]  2.480904e-07 -1.818989e-06  0.000000e+00

Relative gradient close to zero.
Current iterate is probably solution.

> tmpft$est
[1] -2.3741589  3.1874677  9.6708678 21.3220838 32.9733702 
[6] -1.3747431  0.2409259 -0.5664953
### Now weights are: .085, 0.8785, .0365, means 9.67, 21.322, 32.97

> hist(galaxies, nclass=24, prob=T, xlim=c(7,40),
   ylim=c(0,.22), xlab="Galaxy speed", ylab="Rel Freq", main=
   "Best fit to Galaxies Density with up to 3 Symm Comp's")
  lines(pts, .085*dnorm(pts,9.671, exp(-1.3747))+.8785*
    dnorm(pts,21.322,exp(.24093))+.0365*dnorm(pts,32.97,
    exp(-.5665)), lty=3)

  #### Much better fit !!! 

> printgraph(file="GalxFt.ps")

### So far, we have obtained:
    logLik = -222.52 with 2 components (5 free parameters)
	   = -204.60 with 3 components (7 free parameters)

### Finally, we try 4 components:

> tmpft = nlm(function(w)  -lLk(galaxies,c(plogis(w[1]),
   (1-plogis(w[1]))*plogis(w[2]), (1-plogis(w[1]))*
   (1-plogis(w[2]))*plogis(w[3]), (1-plogis(w[1]))*
   (1-plogis(w[2]))*(1-plogis(w[3]))), w[4:7], exp(w[8:11])), 
  c(-2,2,1,10,20,23,33,.2,.3,.3,.2), 
   print.level=1, iterlim=100)
...
iteration = 32
Parameter:
 [1] -2.3722735 -0.8747908  2.8233345  9.6715988 19.7329128 22.2472937
 [7] 32.9745375 -1.3736705 -1.1658544  0.2159503 -0.5665459
Function Value
[1] 197.6925
Gradient:
 [1]  6.253972e-06 -9.094947e-07  1.248273e-06 -2.306862e-06  2.589695e-06
 [6]  2.405600e-06  2.212141e-05 -1.303491e-06 -4.656282e-06  5.684342e-08
[11]  4.689582e-06

Successive iterates within tolerance.
Current iterate is probably solution.

### logLik = -197.7

> tmpft$x
 [1]  -2.3722718  -0.8747788   2.8233325   9.6716033  19.7329244  22.2473120
 [7]  32.9745451  -1.3736665  -1.1658495   0.2159502  -0.5665471

### `Final' answer: weights .08531, 0.26916, 0.60933, 0.0362
## means  9.671603 19.732924 22.247312 32.974545
## scales  0.2531770 0.3116578 1.2410406 0.5674815

> sum(log(.08531*dlogis(galaxies, 9.671603, 0.2531770) + .26916*dlogis(
    galaxies, 19.732924, 0.3116578) + .60933*dlogis(galaxies,
    22.247312, 1.2410406) + 0.0362*dlogis(galaxies,32.974545,
    0.5674815)))
[1] -197.6925

> hist(galaxies, nclass=24, prob=T, xlim=c(7,40),
   ylim=c(0,.22), xlab="Galaxy speed", ylab="Rel Freq", main=
   "Best fit to Galaxies Density with  4 Symm Comp's")
  lines(pts, .08531*dlogis(pts, 9.671603, 0.2531770) + .26916*dlogis(
    pts, 19.732924, 0.3116578) + .60933*dlogis(pts,
    22.247312, 1.2410406) + 0.0362*dlogis(pts,32.974545,
    0.5674815), lty=3)
### Really nice fit !!! We can see this becoming better and better
   with more components !!!

> printgraph(file="GalxFt2.ps")