HW19 Solution

library(MASS)

## (A) labeled scatterplot

plot(anorexia[,2:3], type="n", ylim=c(71,106), main =
  "Nonpar. Regr. lines for Anorexia Pre & Post Wt")
for(i in 1:3) points(anorexia[as.numeric(anorexia$Treat)==i,2:3], pch=i)

## (B) three different nonparametric regression curves

> tmpfr = cbind(anorexia, Prewtsq = anorexia$Prewt^2, Prewtcb = anorexia$Prewt^3)
  fit1 = lm(Postwt ~ Prewt + Prewtsq + Prewtcb, data=tmpfr)
  curve(fit1$coef[1]+fit1$coef[2]*x+fit1$coef[3]*x^2+fit1$coef[4]*x^3, 
        69, 96, add=T, lty=2, col="blue")
  fit2 = smooth.spline(tmpfr[,2:3], spar=0.7)
  lines(fit2$x,fit2$y,  lty=4, col="red")
      ### the fitted spline follows the plotted points much less smoothly
> library(KernSmooth)
  fit3 = locpoly(x=tmpfr$Prewt, y=tmpfr$Postwt, degree=2, bandwidth = 1.5)
  lines(fit3, lty=6, col="orange")

> legend(70.5,106, legend=c("Cubic polynomial regr","Cubic smoothing spline",
     "Loc Poly Regr, deg 2"), lty=c(2,4,6), col=c("blue","red","orange"))

### (C) 

> fdens = function(t) t/(1+t^3+t^5)
  const = 1/integrate(fdens,0,Inf, rel.tol=1.e-8)$val         ###  = 1.936945

  pts = c(0,1/(1-(1:5000)/5001)^0.25-1,20)
> summary(diff(pts))
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
 0.000050  0.000072  0.000119  0.003999  0.000283 12.590615 

  cdf = numeric(5001)
  for(i in 1:5001) cdf[i] = 
        integrate(fdens, pts[i], pts[i+1], rel.tol=1.e-9)$val
  cdf = cumsum(cdf)*const
> cdf[5001]
[1] 0.9999194

> tmp = smooth.spline(cdf,pts[-1], spar=0.6, all.knots=T)                
        #### weird kinks can appear in this fit unless you use all knots

  Finv = function(x) predict(tmp, x=x)$y

  plot(pts[-1], Finv(cdf))
     #### This chows that when we evaluate our Finv at the cdf values, 
     ####   we get something very close to the identity, as we should:
> summary(pts[-1] - Finv(cdf)
      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-3.051e-04  0.000e+00  0.000e+00 -1.370e-07  0.000e+00  7.331e-05 

### So this accuracy looks pretty good

> randomX = Finv(runif(1e4))    ### approximately distributed according to f
  aux = numeric(1e4)
  for(i in 1:1e4) 
          aux[i] = integrate(fdens,0,randomX[i])$val*const 
  round(quantile(randomX - Finv(aux), prob=c(0,.005,.01,.05,.1,.25,.5,.75,
              .9,.95,.99,.995,1)),6)
       0%      0.5%        1%        5%       10%       25%       50%       75% 
-4.937114 -0.000043  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000 
      90%       95%       99%     99.5%      100% 
 0.000000  0.000000  0.000000  0.000008  0.582118 

### This shows very high accuracy except in the extremes. 
> ordX = order(randomX)
  plot(randomX, Finv(aux))
     ### the visible errors all occur for randomX > 5
> rbind(upto3=summary((randomX - Finv(aux))[randomX <= 3]),
        from3to4=summary((randomX - Finv(aux))[abs(randomX-3.5) <= 0.5]),       
        from4to5=summary((randomX - Finv(aux))[abs(randomX-4.5) <= 0.5]) )
                  Min.       1st Qu.       Median          Mean      3rd Qu.         Max.
upto3    -1.772637e-09 -8.881784e-16 1.254552e-14  8.380986e-10 1.549760e-12 2.537099e-07
from3to4 -4.396437e-05  1.643270e-08 1.545706e-07 -7.987336e-07 4.662079e-07 1.524516e-05
from4to5 -9.611826e-03 -4.054193e-04 1.183167e-04 -4.034783e-04 1.007849e-03 4.627660e-03

### To get much better accuracy at values > 5 we would have needed some more knots there !
> sum(pts[-1]>5)
[1] 4