HW19 Solution library(MASS) ## (A) labeled scatterplot plot(anorexia[,2:3], type="n", ylim=c(71,106), main = "Nonpar. Regr. lines for Anorexia Pre & Post Wt") for(i in 1:3) points(anorexia[as.numeric(anorexia$Treat)==i,2:3], pch=i) ## (B) three different nonparametric regression curves > tmpfr = cbind(anorexia, Prewtsq = anorexia$Prewt^2, Prewtcb = anorexia$Prewt^3) fit1 = lm(Postwt ~ Prewt + Prewtsq + Prewtcb, data=tmpfr) curve(fit1$coef[1]+fit1$coef[2]*x+fit1$coef[3]*x^2+fit1$coef[4]*x^3, 69, 96, add=T, lty=2, col="blue") fit2 = smooth.spline(tmpfr[,2:3], spar=0.7) lines(fit2$x,fit2$y, lty=4, col="red") ### the fitted spline follows the plotted points much less smoothly > library(KernSmooth) fit3 = locpoly(x=tmpfr$Prewt, y=tmpfr$Postwt, degree=2, bandwidth = 1.5) lines(fit3, lty=6, col="orange") > legend(70.5,106, legend=c("Cubic polynomial regr","Cubic smoothing spline", "Loc Poly Regr, deg 2"), lty=c(2,4,6), col=c("blue","red","orange")) ### (C) > fdens = function(t) t/(1+t^3+t^5) const = 1/integrate(fdens,0,Inf, rel.tol=1.e-8)$val ### = 1.936945 pts = c(0,1/(1-(1:5000)/5001)^0.25-1,20) > summary(diff(pts)) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000050 0.000072 0.000119 0.003999 0.000283 12.590615 cdf = numeric(5001) for(i in 1:5001) cdf[i] = integrate(fdens, pts[i], pts[i+1], rel.tol=1.e-9)$val cdf = cumsum(cdf)*const > cdf[5001] [1] 0.9999194 > tmp = smooth.spline(cdf,pts[-1], spar=0.6, all.knots=T) #### weird kinks can appear in this fit unless you use all knots Finv = function(x) predict(tmp, x=x)$y plot(pts[-1], Finv(cdf)) #### This chows that when we evaluate our Finv at the cdf values, #### we get something very close to the identity, as we should: > summary(pts[-1] - Finv(cdf) Min. 1st Qu. Median Mean 3rd Qu. Max. -3.051e-04 0.000e+00 0.000e+00 -1.370e-07 0.000e+00 7.331e-05 ### So this accuracy looks pretty good > randomX = Finv(runif(1e4)) ### approximately distributed according to f aux = numeric(1e4) for(i in 1:1e4) aux[i] = integrate(fdens,0,randomX[i])$val*const round(quantile(randomX - Finv(aux), prob=c(0,.005,.01,.05,.1,.25,.5,.75, .9,.95,.99,.995,1)),6) 0% 0.5% 1% 5% 10% 25% 50% 75% -4.937114 -0.000043 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 90% 95% 99% 99.5% 100% 0.000000 0.000000 0.000000 0.000008 0.582118 ### This shows very high accuracy except in the extremes. > ordX = order(randomX) plot(randomX, Finv(aux)) ### the visible errors all occur for randomX > 5 > rbind(upto3=summary((randomX - Finv(aux))[randomX <= 3]), from3to4=summary((randomX - Finv(aux))[abs(randomX-3.5) <= 0.5]), from4to5=summary((randomX - Finv(aux))[abs(randomX-4.5) <= 0.5]) ) Min. 1st Qu. Median Mean 3rd Qu. Max. upto3 -1.772637e-09 -8.881784e-16 1.254552e-14 8.380986e-10 1.549760e-12 2.537099e-07 from3to4 -4.396437e-05 1.643270e-08 1.545706e-07 -7.987336e-07 4.662079e-07 1.524516e-05 from4to5 -9.611826e-03 -4.054193e-04 1.183167e-04 -4.034783e-04 1.007849e-03 4.627660e-03 ### To get much better accuracy at values > 5 we would have needed some more knots there ! > sum(pts[-1]>5) [1] 4