Class Log Nov. 27 & 29 ====================== > library(MASS) galaxies = galaxies/1000 > kerfcn = function(xnew, bw, pts, kern=function(x) dnorm(x)) { n = length(pts) c(outer(xnew, pts, function(x,y) kern((x-y)/bw)/bw) %*% rep(1/n,n)) } > plot(density(galaxies)) > density(galaxies)$bw [1] 1.001839 > plot(density(galaxies, window="tria")) > plot(density(galaxies, window="rect")) ### same bw, similar picture in each case > curve(kerfcn(x,1,galaxies),5,40, add=T, col="blue") > curve(kerfcn(x,1,galaxies, kern=function(x) abs(x)<0.5),5,40, add=T, col="blue") > curve(kerfcn(x,2,galaxies, kern=function(x) abs(x)<0.5),5,40, add=T, col="red") ## Now generate a sequence of ordered points to help in numerical integration > pts <- density(galaxies)$x ### by default, 512 ordered increasing equally spaced points > range(galaxies) [1] 9.172 34.279 > range(pts) [1] 6.166482 37.284518 > LgisCV <- function(b, Pts) { ### little function to evaluate least-sq error criterion ### for galaxies data with bin-width b kertmp <- outer(Pts, galaxies, function(x,y,B) dlogis(x,y,B), B=b) denstmp <- cbind(Pts, c(kertmp %*% rep(1/82,82))) delta <- c(range(Pts) %*% c(-1,1))/(length(Pts)-1) intsq <- delta*(sum(denstmp[,2]^2) - 0.5*(denstmp[1,2]^2+ denstmp[length(Pts),2]^2)) ### The numerical integration here uses trapezoid rule. penalty <- numeric(82) for(i in 1:82) penalty[i] <- mean( dlogis(galaxies[i],galaxies[-i],b)) intsq - 2*mean(penalty) } > c(LgisCV(.2,pts), LgisCV(.5,pts), LgisCV(.8,pts)) -0.10124739 -0.10422158 -0.09922364 > optimize(LgisCV, c(.1,1.5), Pts=pts)[1:2] $minimum [1] 0.3744886 #### as in DensNPR.Log but not in class ### The mistake in class was not to use a large number ### of equally spaced points to get accurate numerical integral. $objective -0.1052371 ### To see that the result is not terribly senstive to exactly which sequence of ### points is used: > unlist(optimize(LgisCV, c(.1,1.5), Pts=seq(7,39,length=200))[1:2] minimum objective 0.3744911 -0.1052371 #------------------------------------------------------------------ ### BACK TO "DensNPR.Log" for smoothing-splines implementation > tmp = density(galaxies, n=64) plot(tmp, type="p", cex=0.6, main="Overplotted `galaxies' Densities") rnkx = numeric(64) for(i in 1:64) rnkx[i] = (sum(galaxies <= tmp$x[i])+.5)/83 splgalx = smooth.spline(tmp$x, y=rnkx, spar=0.05) curve(predict(splgalx, x, deriv=1)$y, 5, 40, add=T, col="blue") splgalx2 = smooth.spline(tmp$x, y=rnkx, spar=0.5) curve(predict(splgalx2, x, deriv=1)$y, 5, 40, add=T, col="orange") #### This shows the effect of spar , like that of bw . ##-------------------------------------------------------------------- ### NOTE generic nature of "predict": > xv = 1:30; yv = -3 +2*xv + rnorm(30)*0.5 > tmp = lm(yv ~ xv) > predict(tmp, newdata=cbind.data.frame(xv=xv[1:5])) 1 2 3 4 5 -1.1047942 0.8920685 2.8889313 4.8857940 6.8826567 > tmp$fit[1:5] 1 2 3 4 5 -1.1047942 0.8920685 2.8889313 4.8857940 6.8826567 ###>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> > library(KernSmooth) KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009 ### THE FUNCTION ksmooth IN THIS LIBRARY IMPLEMENTS THE SAME FUNCTION WE HAVE ### GIVEN ABOVE AS "kerfcn", THE NADARAYA-WATSON [KERNEL-DENSITY] ESTIMATOR > attach(oats) > search() [1] ".GlobalEnv" "oats" "package:KernSmooth" [4] "package:MASS" "package:stats" "package:graphics" [7] "package:grDevices" "package:utils" "package:datasets" [10] "package:methods" "Autoloads" "package:base" Description Estimates a probability density function, regression function or their derivatives using local polynomials. A fast binned implementation over an equally-spaced grid is used. Usage locpoly(x, y, drv = 0L, degree, kernel = "normal", bandwidth, gridsize = 401L, bwdisc = 25, range.x, binned = FALSE, truncate = TRUE) ### For a general math-stat reference on this topic, see the book ### Local Regression and Likelihood, by Clive Loader, ### available online at ## http://web.ipac.caltech.edu/staff/fmasci/home/astro_refs/LocalRegressionBook_1999.pdf ### Here are some usage examples of the locpoly function. ### First we return to density estimation > plot(density(galaxies)) curve(kerfcn(x,2,galaxies, kern=function(x) abs(x)<0.5),5,40, add=T, col="red") > tmpfitg = locpoly(galaxies, degree=2, bandwidth=1) > names(tmpfitg) [1] "x" "y" > lines(tmpfitg$x, pmax(tmpfitg$y,0), lty=3, col="blue") > plot(tmpfitg$x, pmax(tmpfitg$y,0), type="l", xlim=c(5,40)) lines(locpoly(galaxies, degree=2, bandwidth=1), lty=2, col="blue") ### "degree" is ignored for density estimation ### DISADVANTAGE of this function is that there is no "predict" method for it, so ### it cannot be implemented directly as a function. ### ADVANTAGE is that derivatives can be estimated. > pts = seq(5,40, length=101) rnkx = c(outer(pts, galaxies, function(x,y) y <= x) %*% rep(1/82,82) ) tmpfitg2 = locpoly(pts, y=rnkx, drv=1L, degree=2, bandwidth=1) > lines(tmpfitg2) plot(density(galaxies)) curve(kerfcn(x,2,galaxies, kern=function(x) abs(x)<0.5),5,40, add=T, col="red") lines(tmpfitg2, lty=3, col="red") lines(locpoly(pts, y=rnkx, drv=1L, degree=2, bandwidth=2), lty=4, col="orange") #--------------------------------------------------------------------------- #### Now we move on to NONPARAMETRIC REGRESSION, where we can benefit from #### choice of degree and bandwidth. #### [Consider the use of the cross-validated density-estimation bandwidth for ### We illustrate throughout with dataset "geyser" in MASS (similar to "faithful" ### in base datasets. > dim(geyser) [1] 299 2 > plot(geyser) ### What kind of model does this suggest ??? We will return to this question ### after figuring out a curve for E(Y|X=x) around which to calculate residuals. ## One type of curve to "fit" is "polynomial regression" > tmp.lin = lm(duration ~ waiting, data=geyser) curve(predict(tmp.lin, newdata=cbind.data.frame(waiting=x)), c(40,110), add=T, col="blue") abline(tmp.lin$coef, col="red") ### for LINEAR regression, the prediction curve is the line ### Now consider regression on first, 2nd and 3rd powers of waiting > geyser3 = cbind.data.frame(waiting=geyser$wait, waitsq=geyser$wait^2, waitcb=geyser$wait^3, duration=geyser[2]) > names(geyser3) [1] "waiting" "waitsq" "waitcb" "duration" tmp.poly = lm(duration ~ ., data=geyser3) > summary(diff(geyser$wait)) #### WAIT IS NOT SORTED INCREASING !! Min. 1st Qu. Median Mean 3rd Qu. Max. -58.00000 -21.00000 -2.00000 -0.00336 24.00000 48.00000 ordw = order(geyser$wait) lines(geyser$wait[ordw], tmp.poly$fit[ordw], lty=2, col="orange") ### This curve gives some idea of Y given X behavior, but if we want E(Y|X=x) ### we can do much better. > plot(geyser) locfit0 = locpoly(geyser$wait, y=geyser$dur, degree=0, bandw=2) lines(locfit0, col="red") locfit3 = locpoly(geyser$wait, y=geyser$dur, degree=3, bandw=2) lines(locfit3, col="blue") ### Different behavior, gets crazy in gap between points in right tail locfit3B = locpoly(geyser$wait, y=geyser$dur, degree=3, gridsize=30, bandw = c(rep(2,27), 5,10,15)) lines(locfit3B, col="orange") ### Although the R function allows us to alter the bandwidth in different ### regions it is still somewhat hard to keep the function smooth. ### So let's try smoothing spline as another possibility. > splgeys = smooth.spline(geyser$wait, geyser$dur, spar=0.3) summary(diff(splgeys$x)) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.000 1.000 1.000 1.275 1.000 10.000 ### ordered increasing !! > length(splgeys$x) [1] 52 #### = number of unique geyser$waiting times splgeysA = smooth.spline(geyser$wait, geyser$dur, all.knots=T, spar=0.3) ### results not quite identical near endpoint, but otherwise seems yes > lines(splgeys$x,splgeys$y, lty=3, col="brown") ## OK, good fit but not very smooth splgeys2 = smooth.spline(geyser$wait, geyser$dur, spar=0.6) lines(splgeys2, lty=5, col="orange") lines(locfit0, col="red") ### contrast between spline and local polynomial degree 0 ### OK, now return to the problem of modeling: > plot(splgeys2$x, splgeys2$yin-splgeys2$y) abline(v=68, col="green", lty=8) abline(h=0, col="green", lty=8) ### Here our opinion of modeling might have changed. What do you think ? ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Final Topic/Trick: INVERSION USIING SPLINES ## Suppose we have a density f or cdf F given analytically and we want a function ## to approximate the inverse of F [eg for quick simulation] ## WE CAN FIT A FUNCTION TO EVALUATED POINTS (F(x[i]), x[i]) via smoothing-splines!! ## EXAMPLE: f(x) = dgamma(x, 1.3, 2.2) > curve(dgamma(x,1.3,2.2),0,3.5) curve(pgamma(x,1.3,2.2),0,3.5) > xpts = seq(0,5,length=251) ypts = pgamma(xpts,1.3,2.2) plot(ypts,xpts) Finv = function(x) predict(smooth.spline(ypts,xpts, spar=0.6), x=x)$y curve(Finv(x),0,1, col="red", add=T) rvec = runif(1000) summary( qgamma(rvec,1.3,2.2) - Finv(rvec) ) Min. 1st Qu. Median Mean 3rd Qu. Max. -3.135e-04 -1.185e-06 -1.677e-07 1.154e-05 1.502e-06 1.645e-03