Log Related to Density Estimation & Nonparametric Regression
=============================================================
                       4/28/04, updated 11/30/2009

## Examples use "galaxies" data (in web-page data-directory)
## along with "geyser" data supplied in standard Splus6.0
## and in library "MASS" within R.

Selected statements/functions for class use.
--------------------------------------------

> galaxies = scan("galaxies.dat", skip=3)
> galaxies = galaxies/1000
> summary(galaxies)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  9.172   19.53   20.83   20.83   23.13   34.28

> hist(galaxies, nclass=16, prob=T, xlim=c(4,40), ylim=c(0,.18),
       xlab="Galaxy speed", ylab="Rel Freq",
       main="Gaussian Window, Default")
> dens1 = density(galaxies)
  lines(dens1)

> hist(galaxies, nclass=16, prob=T, xlim=c(4,40), ylim=c(0,.18),
       xlab="Galaxy speed", ylab="Rel Freq",
       main="Rectang Window, Default")
  lines(density(galaxies, window="r"))

> pts <- dens1$x
  kerMat <- outer(pts, galaxies, function(x,y) dlogis(x,y,.2))
> dim(kerMat)
[1] 50 82
> dens3 = cbind(pts, c(kerMat %*% rep(1/82,82)))

=== Next a little bit on cross-validated bandwidth selection ===

WHAT WE DISCUSS HERE IS "LEAST-SQUARES CROSS-VALIDATION".
More details can be found in:
   W. Haerdle, SMOOTHING TECHNIQUES WITH IMPLEMENTATION IN S
      (Springer-Verlag book, 1991), Sec 4.3.2
   Scott, D. and Terrell, G. (1987) JASA v.82, p.1131 article.

> LgisCV <- function(b, Pts) {
     ### little function to evaluate least-squares error criterion
     ### for galaxies data with bandwidth b
     kertmp <- outer(Pts, galaxies, function(x,y,B) dlogis(x,y,B), B=b)
     denstmp <- cbind(Pts, c(kertmp %*% rep(1/82,82)))
     delta <- c(range(Pts) %*% c(-1,1))/(length(Pts)-1)
     intsq <- delta*(sum(denstmp[,2]^2) - 0.5*(denstmp[1,2]^2 +
                     denstmp[length(Pts),2]^2))
     ### The numerical integration here uses the trapezoid rule.
     penalty <- numeric(82)
     for(i in 1:82) penalty[i] <- mean(dlogis(galaxies[i], galaxies[-i], b))
     intsq - 2*mean(penalty)
  }
### NOTE: minimizing this function gives a
###       "Cross-Validated Bandwidth" b.

> LgisCV(.2, pts)
[1] -0.0998805
> LgisCV(.5, pts)
[1] -0.1042199
> LgisCV(.8, pts)
[1] -0.09922277
> optimize(LgisCV, c(.1,1.5), Pts=pts)[1:2]
$minimum:
[1] 0.3783803
$objective:
[1] -0.1052062

### Thus the optimized bandwidth is 0.3784:

> lines(pts, c(outer(pts, galaxies, function(x,y) dlogis(x,y,0.3784)) %*%
               rep(1/82,82)), lty=4)

### Adding this curve to either of the Gaussian or
### Rectangular default plots above shows a really
### good job of fitting !!

------------------------------------------------------------
### Next: spline-based estimate, using all points as "knots".
### First get spline estimate of cdf, then density using
### deriv=1 in "predict" applied to smooth.spline object.

> pts = density(galaxies)$x[(1:50)*10]
> rnkx = numeric(50)
  for (i in 1:50) rnkx[i] = sum(galaxies <= pts[i])/82
> splgalx = smooth.spline(pts, rnkx, spar=.4)
> plot(pts, rnkx)
  lines(pts, predict(splgalx, pts)$y)

> par(mfrow=c(2,2))
  spars = c(.1, .3, .6, .9)
  for(i in 1:4) {
     hist(galaxies, nclass=16, prob=T, xlim=c(7,40), ylim=c(0,.18),
          xlab="Galaxy speed", ylab="Rel Freq",
          main=paste("Spline Fit, spar=", spars[i], sep=""))
     splgalx = smooth.spline(pts, rnkx, spar=spars[i])
     lines(pts, predict(splgalx, pts, 1)$y)
  }
### Can also use the "postscript" or "pdf" command to save the graphs
### produced in this way (on separate pages); then close the device
### with a final dev.off() command to produce the file.
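As a small aside (not part of the original class log), the one-off kerMat /
dens3 computation above can be wrapped in a reusable function.  The sketch
below does that; the name kdensLogis, its default grid, and the 200-point
grid length are illustrative choices, not anything from the log.

### Sketch (illustrative): logistic-kernel density estimate on a grid.
kdensLogis = function(data, bw,
        gridpts = seq(min(data)-3*bw, max(data)+3*bw, length=200)) {
   n = length(data)
   ### row j of kmat holds the n kernel ordinates dlogis(gridpts[j], data, bw)
   kmat = outer(gridpts, data, function(x,y) dlogis(x,y,bw))
   list(x = gridpts, y = c(kmat %*% rep(1/n, n)))
}
### e.g.  lines(kdensLogis(galaxies, 0.3784), lty=4)  draws essentially the
### same cross-validated-bandwidth curve as above, just on a different grid
### of evaluation points.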
-----------------------------------------------------------
### Now let's compare with a mixture of logistics:

> lLk = function(x, pwt, mn, scal, dens = dlogis) {
     invsc = 1/scal
     dnsmat = invsc*matrix(dens(invsc*outer(mn,x,"-")), ncol=length(x))
     sum(log(pwt %*% dnsmat))
  }
> lLk(galaxies, c(.5,.5), c(20,30), c(2,2))     ## -268.8732

> tmpft = nlm(function(w) -lLk(galaxies, c(plogis(w[1]), 1-plogis(w[1])),
              w[2:3], exp(w[4:5])), c(0,20,30,2,2), print.level=1, iterlim=100)
iteration = 0
Step:
[1] 0 0 0 0 0
Parameter:
[1]  0 20 30  2  2
Function Value
[1] 296.6142
Gradient:
[1] -6.6289576 -0.1329425  2.2508516 39.7330157 11.2657407
    ...
iteration = 23
Parameter:
[1]  1.1663503 21.2954749 18.7628547  0.1574718  1.6160380
Function Value
[1] 222.5247
Gradient:
[1]  8.795656e-07 -2.276215e-06 -1.631197e-07 -5.599645e-06  6.281384e-06

Relative gradient close to zero.  Current iterate is probably solution.

> tmpft$est
[1]  1.1663496 21.2954746 18.7628558  0.1574711  1.6160374

### So the fitted mixture assigns mean 21.295 with weight
###   plogis(1.1663) = 0.762, and mean 18.763 with weight 0.238.

============================================================
Remainder of this log has selected Nonparametric Regression
and Smoothing steps for class.        4/30/04, updated 12/2/09
----------------------------------------------------------

> gfram = cbind.data.frame(Wait=geyser$waiting, Lgth=geyser$duration)
  plot(gfram[,1], gfram[,2], xlab="Wait", ylab="Lgth",
       main="Scatterplot of Geyser Data with Lowess, n=299")

### Piecewise-linear fits within successive 12-unit Wait intervals:
> for(i in 1:4) abline(v=42+i*12, lty=3)
  for(i in 1:5) {
     inds = (1:299)[abs(gfram$Wait-42-12*i+6) < 6]
     itmp = order(gfram$Wait[inds])
     lines(gfram$Wait[inds[itmp]],
           lm(Lgth ~ Wait, data=gfram[inds,])$fit[itmp], lty=6)
  }
> tmplow <- lowess(gfram$Wait, gfram$Lgth)
  lines(tmplow$x, tmplow$y, lty=4)

### Now kernel-based nonparametric-regression fit:
### take expectations near each of 50 evenly spaced points.
> xp = seq(40, 110, length=50)
  kermat = outer(xp, gfram$Wait, function(x,y) dnorm(x,y,1))
### Our bandwidth is 1, using a Gaussian kernel.
### Now create the conditional expectations using the kernel ordinates as weights:
> cexp <- c((kermat %*% gfram$Lgth)/(kermat %*% rep(1,299)))
  points(xp, cexp, pch=5)

> tmpsp = smooth.spline(gfram$Wait, gfram$Lgth, all.knots=F, spar=.4)
  lines(tmpsp$x, tmpsp$y, lty=1)
  legend(locator(), legend=c("Piecewise linear","Lowess","Spline"),
         lty=c(6,4,1))
  text(locator(), paste("Hollow diamonds are \n",
       "kernel-regression points"))
> printgraph(file="geyserNPR.ps")     ### changed in R !!!

----- Remains to use Cross-Validation to check --------------
      Predictive Success of all of these modelling Strategies!

Method: leave 30 points out (at random), fit using the rest by all
of the NPR methods (lowess, spline, kernel-regression).  Repeat 1000
times to get an averaged mean-square prediction error per observation.

Let's try to do cross-validation using only the spline and kernel
regressions, since with lowess (or the related function loess.smooth)
we do not automatically have smoothed-function evaluations at newly
specified points.  (The documentation just suggests using "approx",
i.e. linear interpolation, to get smoothed-function approximations
at new points from a lowess fit.)
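For completeness, here is a sketch (not part of the original log) of how
lowess could nevertheless be folded into the same leave-30-out comparison,
using approx() to interpolate the fitted curve at the held-out Wait values
as the documentation suggests.  The names lowerr/lowpred and the choice
rule=2 (extend the end values beyond the fitted range) are illustrative.

lowerr = numeric(100)
for (i in 1:100) {
   leftout = sample(299, 30)
   leftin  = setdiff(1:299, leftout)
   tmplow  = lowess(gfram$Wait[leftin], gfram$Lgth[leftin])
   ### interpolate the lowess curve linearly at the held-out Wait values
   lowpred = approx(tmplow$x, tmplow$y, xout=gfram$Wait[leftout], rule=2)$y
   lowerr[i] = mean((lowpred - gfram$Lgth[leftout])^2)
}
summary(lowerr)

Its summary() could then be set alongside the splerr and kererr summaries
computed next.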
> splerr = kererr = numeric(100)
  for (i in 1:100) {
     leftout = sample(299, 30)
     leftin = setdiff(1:299, leftout)
     tmpsp = smooth.spline(gfram$Wait[leftin], gfram$Lgth[leftin],
                           all.knots=F, spar=0.4)
     splpred = predict(tmpsp, gfram$Wait[leftout])$y
     kermat = outer(gfram$Wait[leftout], gfram$Wait[leftin],
                    function(x,y) dnorm(x,y,1))
     yexp = c((kermat %*% gfram$Lgth[leftin])/(kermat %*% rep(1,269)))
     splerr[i] = mean((splpred-gfram$Lgth[leftout])^2)
     kererr[i] = mean((yexp-gfram$Lgth[leftout])^2)
  }
> summary(splerr)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 0.3695  0.6809  0.7945  0.8081  0.9059  1.3180
> length(splerr)
[1] 100
> summary(kererr)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 0.3688  0.6761  0.7842  0.7856  0.8897  1.1940

### This comparison seems to show that overall, the kernel-regression
### estimator is doing a little better than the spline.  Maybe we can
### broaden the comparison a little by trying the spline again, with a
### slightly larger smoothing parameter, and the kernel regression with
### a bandwidth chosen by least-squares cross-validation applied to the
### x-coordinate (Wait) data alone ...

> LgisCV = function(b, Pts, datavec) {
     ### little function to evaluate the least-squares CV criterion
     ### for the data-vector datavec with bandwidth b
     nd = length(datavec)
     kertmp = outer(Pts, datavec, function(x,y,B) dlogis(x,y,B), B=b)
     denstmp = cbind(Pts, c(kertmp %*% rep(1/nd,nd)))
     delta = c(range(Pts) %*% c(-1,1))/(length(Pts)-1)
     intsq = delta*(sum(denstmp[,2]^2) - 0.5*(denstmp[1,2]^2 +
                    denstmp[length(Pts),2]^2))
     ### The numerical integration here uses the trapezoid rule.
     penalty = numeric(nd)
     for(i in 1:nd) penalty[i] = mean(dlogis(datavec[i], datavec[-i], b))
     intsq - 2*mean(penalty)
  }
> optimize(LgisCV, c(.1,2), Pts=45:105, datavec=gfram$Wait)[1:2]
$minimum:
[1] 1.25539
$objective:
[1] -0.02346868

> splerr2 = kererr2 = numeric(100)
  for (i in 1:100) {
     leftout = sample(299, 30)
     leftin = setdiff(1:299, leftout)
     tmpsp = smooth.spline(gfram$Wait[leftin], gfram$Lgth[leftin],
                           all.knots=F, spar=0.6)
     splpred = predict(tmpsp, gfram$Wait[leftout])$y
     kermat = outer(gfram$Wait[leftout], gfram$Wait[leftin],
                    function(x,y) dnorm(x,y,1.2554))
     yexp = c((kermat %*% gfram$Lgth[leftin])/(kermat %*% rep(1,269)))
     splerr2[i] = mean((splpred-gfram$Lgth[leftout])^2)
     kererr2[i] = mean((yexp-gfram$Lgth[leftout])^2)
  }
> rbind(summary(splerr), summary(kererr), summary(splerr2), summary(kererr2))
       Min. 1st Qu. Median   Mean 3rd Qu.  Max.
[1,] 0.3695  0.6809 0.7945 0.8081  0.9059 1.318
[2,] 0.3688  0.6761 0.7842 0.7856  0.8897 1.194
[3,] 0.4380  0.6883 0.7976 0.7893  0.8768 1.187
[4,] 0.4487  0.7124 0.7942 0.8059  0.9052 1.208

### One might have expected the cross-validated-bandwidth kernel NPR
### method (row [4,]) to be best, but that bandwidth choice was not
### designed to do best with respect to the task at hand (predicting
### y's from smoothed conditional expectations given x), and indeed
### it did not turn out best here !!
### "Cross-validated" bandwidth selection could instead be done with a
### criterion equal to the mean squared prediction error, and then by
### definition it would be best, but one needs sophisticated large-sample
### theory to say whether it is enough better to be worthwhile ...
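Following up on that last remark, here is a minimal sketch (not part of the
original log) of bandwidth selection driven directly by mean squared
prediction error.  The function name predCV, the nrep=50 replications, and
the trial bandwidths are all illustrative choices.

### Sketch (illustrative): choose the kernel-regression bandwidth by
### minimizing leave-30-out mean squared prediction error on the geyser data.
predCV = function(b, nrep=50) {
   err = numeric(nrep)
   for (i in 1:nrep) {
      leftout = sample(299, 30)
      leftin  = setdiff(1:299, leftout)
      kermat  = outer(gfram$Wait[leftout], gfram$Wait[leftin],
                      function(x,y) dnorm(x,y,b))
      yexp = c((kermat %*% gfram$Lgth[leftin])/(kermat %*% rep(1,269)))
      err[i] = mean((yexp - gfram$Lgth[leftout])^2)
   }
   mean(err)
}
### e.g.  set.seed(1); sapply(c(.6, 1, 1.5, 2, 3), predCV)  gives a rough
### picture of how prediction error varies with bandwidth.  Because the
### random leave-out splits make the criterion noisy, a grid search like
### this (or optimize() with many replications) is only a heuristic.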