Class Log Nov. 27 & 29
======================

> library(MASS)
  galaxies = galaxies/1000

> kerfcn = function(xnew, bw, pts,  kern=function(x) dnorm(x)) {
       n = length(pts)
       c(outer(xnew, pts, function(x,y) kern((x-y)/bw)/bw) %*% rep(1/n,n))
  }

  > plot(density(galaxies))
  > density(galaxies)$bw
[1] 1.001839
> plot(density(galaxies, window="tria"))
> plot(density(galaxies, window="rect"))    ### same bw, similar picture in each case

> curve(kerfcn(x,1,galaxies),5,40, add=T, col="blue")
> curve(kerfcn(x,1,galaxies, kern=function(x) abs(x)<0.5),5,40, add=T, col="blue")
> curve(kerfcn(x,2,galaxies, kern=function(x) abs(x)<0.5),5,40, add=T, col="red")

## Now generate a sequence of ordered points to help in numerical integration

> pts <- density(galaxies)$x    
              ### by default, 512 ordered increasing equally spaced points
> range(galaxies)
[1]  9.172 34.279
> range(pts)
[1]  6.166482 37.284518


> LgisCV <- function(b, Pts) {
  ### little function to evaluate least-sq error criterion
  ###  for galaxies data with bin-width b
   kertmp <- outer(Pts, galaxies, function(x,y,B) dlogis(x,y,B), B=b)
   denstmp <- cbind(Pts, c(kertmp %*% rep(1/82,82)))
   delta <- c(range(Pts) %*% c(-1,1))/(length(Pts)-1)
   intsq <- delta*(sum(denstmp[,2]^2) - 0.5*(denstmp[1,2]^2+
        denstmp[length(Pts),2]^2))
  ### The numerical integration here uses trapezoid rule.
   penalty <- numeric(82)
   for(i in 1:82) penalty[i] <- mean(
        dlogis(galaxies[i],galaxies[-i],b))
   intsq - 2*mean(penalty)  }

> c(LgisCV(.2,pts), LgisCV(.5,pts), LgisCV(.8,pts))
-0.10124739 -0.10422158 -0.09922364 

> optimize(LgisCV, c(.1,1.5), Pts=pts)[1:2]
$minimum
[1] 0.3744886             #### as in DensNPR.Log but not in class
                 ### The mistake in class was not to use a large number 
                 ###  of equally spaced points to get accurate numerical integral.
$objective           
-0.1052371 

### To see that the result is not terribly senstive to exactly which sequence of 
###    points is used:

> unlist(optimize(LgisCV, c(.1,1.5), Pts=seq(7,39,length=200))[1:2]
   minimum  objective 
 0.3744911 -0.1052371 

#------------------------------------------------------------------
### BACK TO "DensNPR.Log" for smoothing-splines implementation

> tmp = density(galaxies, n=64)
  plot(tmp, type="p", cex=0.6, main="Overplotted `galaxies' Densities")
  rnkx = numeric(64)
  for(i in 1:64) rnkx[i] = (sum(galaxies <= tmp$x[i])+.5)/83
  splgalx = smooth.spline(tmp$x, y=rnkx, spar=0.05)
  curve(predict(splgalx, x, deriv=1)$y, 5, 40, add=T, col="blue")
  splgalx2 = smooth.spline(tmp$x, y=rnkx, spar=0.5)
  curve(predict(splgalx2, x, deriv=1)$y, 5, 40, add=T, col="orange")
        #### This shows  the effect of  spar  ,  like that of  bw .

##--------------------------------------------------------------------

### NOTE generic nature of "predict":

> xv = 1:30; yv = -3 +2*xv + rnorm(30)*0.5
> tmp = lm(yv ~ xv)

> predict(tmp, newdata=cbind.data.frame(xv=xv[1:5]))
         1          2          3          4          5 
-1.1047942  0.8920685  2.8889313  4.8857940  6.8826567 
> tmp$fit[1:5]
         1          2          3          4          5 
-1.1047942  0.8920685  2.8889313  4.8857940  6.8826567 


###>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

> library(KernSmooth)
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009

### THE FUNCTION  ksmooth  IN THIS LIBRARY IMPLEMENTS THE SAME FUNCTION WE HAVE
### GIVEN ABOVE AS "kerfcn", THE NADARAYA-WATSON [KERNEL-DENSITY] ESTIMATOR

> attach(oats)

>  search()
 [1] ".GlobalEnv"         "oats"               "package:KernSmooth"
 [4] "package:MASS"       "package:stats"      "package:graphics"  
 [7] "package:grDevices"  "package:utils"      "package:datasets"  
[10] "package:methods"    "Autoloads"          "package:base"      

Description

Estimates a probability density function, regression function or their derivatives using 
local polynomials. A fast binned implementation over an equally-spaced grid is used.

Usage

locpoly(x, y, drv = 0L, degree, kernel = "normal", 
        bandwidth, gridsize = 401L, bwdisc = 25, 
        range.x, binned = FALSE, truncate = TRUE)

### For a general math-stat reference on this topic, see the book
###    Local Regression and Likelihood, by Clive Loader,
###  available online at 
## http://web.ipac.caltech.edu/staff/fmasci/home/astro_refs/LocalRegressionBook_1999.pdf		

### Here are some usage examples of the locpoly function.
### First we return to density estimation

> plot(density(galaxies))
  curve(kerfcn(x,2,galaxies, kern=function(x) abs(x)<0.5),5,40, add=T, col="red")

> tmpfitg = locpoly(galaxies, degree=2, bandwidth=1)
> names(tmpfitg)
[1] "x" "y"
> lines(tmpfitg$x, pmax(tmpfitg$y,0), lty=3, col="blue")

> plot(tmpfitg$x, pmax(tmpfitg$y,0), type="l", xlim=c(5,40))
  lines(locpoly(galaxies, degree=2, bandwidth=1), lty=2, col="blue")
           ### "degree" is ignored for density estimation

### DISADVANTAGE of this function is that there is no "predict" method for it, so
###   it cannot be implemented directly as a function.
### ADVANTAGE is that derivatives can be estimated.

> pts = seq(5,40, length=101)
  rnkx = c(outer(pts, galaxies, function(x,y) y <= x) %*% rep(1/82,82) )
  tmpfitg2 = locpoly(pts, y=rnkx, drv=1L, degree=2, bandwidth=1)
> lines(tmpfitg2)

plot(density(galaxies))
  curve(kerfcn(x,2,galaxies, kern=function(x) abs(x)<0.5),5,40, add=T, col="red")
lines(tmpfitg2, lty=3, col="red")
lines(locpoly(pts, y=rnkx, drv=1L, degree=2, bandwidth=2), lty=4, col="orange")

#---------------------------------------------------------------------------
####  Now we move on to NONPARAMETRIC REGRESSION, where we can benefit from 
####    choice of degree and bandwidth.
#### [Consider the use of the cross-validated density-estimation bandwidth for

### We illustrate throughout with dataset "geyser" in MASS (similar to "faithful" 
### in base datasets.

> dim(geyser)
[1] 299   2
> plot(geyser)

### What kind of model does this suggest ??? We will return to this question 
### after figuring out a curve for E(Y|X=x) around which to calculate residuals.

## One type of curve to "fit" is "polynomial regression"

> tmp.lin = lm(duration ~ waiting, data=geyser)
  curve(predict(tmp.lin, newdata=cbind.data.frame(waiting=x)), c(40,110), add=T, col="blue")
  abline(tmp.lin$coef, col="red")    
         ### for LINEAR regression, the prediction curve is the line
### Now consider regression on first, 2nd and 3rd powers of waiting

> geyser3 = cbind.data.frame(waiting=geyser$wait, waitsq=geyser$wait^2, 
              waitcb=geyser$wait^3, duration=geyser[2])
> names(geyser3)
[1] "waiting"  "waitsq"   "waitcb"   "duration"
  tmp.poly = lm(duration ~ ., data=geyser3)

> summary(diff(geyser$wait))             #### WAIT IS NOT SORTED INCREASING !!
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-58.00000 -21.00000  -2.00000  -0.00336  24.00000  48.00000 
  ordw = order(geyser$wait)
  lines(geyser$wait[ordw], tmp.poly$fit[ordw], lty=2, col="orange")

### This curve gives some idea of Y given X behavior, but if we want E(Y|X=x)
###   we can do much better.

> plot(geyser)

  locfit0 = locpoly(geyser$wait, y=geyser$dur, degree=0, bandw=2)
  lines(locfit0, col="red")
  locfit3 = locpoly(geyser$wait, y=geyser$dur, degree=3, bandw=2)
  lines(locfit3, col="blue")
         ### Different behavior, gets crazy in gap between points in right tail
  locfit3B = locpoly(geyser$wait, y=geyser$dur, degree=3, gridsize=30,
            bandw = c(rep(2,27), 5,10,15))
  lines(locfit3B, col="orange")
        ### Although the R function allows us to alter the bandwidth in different 
     ### regions it is still somewhat hard to keep the function smooth. 

### So let's try smoothing spline as another possibility.

> splgeys = smooth.spline(geyser$wait, geyser$dur, spar=0.3)
  summary(diff(splgeys$x))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.000   1.000   1.275   1.000  10.000    ### ordered increasing !!
> length(splgeys$x)
[1] 52                     #### = number of unique geyser$waiting times

splgeysA = smooth.spline(geyser$wait, geyser$dur, all.knots=T, spar=0.3)
  ### results not quite identical near endpoint, but otherwise seems yes

> lines(splgeys$x,splgeys$y, lty=3, col="brown")  ## OK, good fit but not very smooth
  splgeys2 = smooth.spline(geyser$wait, geyser$dur, spar=0.6)
  lines(splgeys2, lty=5, col="orange")
  lines(locfit0, col="red")
        ### contrast between spline and local polynomial degree 0

### OK, now return to the problem of modeling:

> plot(splgeys2$x, splgeys2$yin-splgeys2$y)
  abline(v=68, col="green", lty=8)
  abline(h=0, col="green", lty=8)

### Here our opinion of modeling might have changed. What do you think ?

##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Final Topic/Trick: INVERSION USIING SPLINES

##  Suppose we have a density f  or cdf  F  given analytically and we want a function
##     to approximate the inverse of F   [eg for quick simulation]

## WE CAN FIT A FUNCTION TO EVALUATED POINTS  (F(x[i]), x[i])  via smoothing-splines!!

## EXAMPLE:     f(x) = dgamma(x, 1.3, 2.2)

> curve(dgamma(x,1.3,2.2),0,3.5)
  curve(pgamma(x,1.3,2.2),0,3.5)

> xpts = seq(0,5,length=251)
  ypts = pgamma(xpts,1.3,2.2)

  plot(ypts,xpts)

  Finv = function(x) predict(smooth.spline(ypts,xpts, spar=0.6), x=x)$y
  curve(Finv(x),0,1, col="red", add=T)

rvec = runif(1000)
summary( qgamma(rvec,1.3,2.2) - Finv(rvec) )
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-3.135e-04 -1.185e-06 -1.677e-07  1.154e-05  1.502e-06  1.645e-03