Solution Log for HW7 problem in Stat 798c
========================================= 5/14/03

> HW7dat <- scan("/home1/evs/public_html/s798c/Data/HW7data")
> summary(HW7dat)			### vector of lgth 500
     Min. 1st Qu. Median  Mean 3rd Qu. Max. 
 0.003383  0.1992 0.5597 1.161   1.388   15

> motif()
> hist(HW7dat, nclass=60, prob=T)
(a) Kernel-density method, dlogis with bandwidth .1  .

> pts7 <- sort(HW7dat)[(1:49)*10 + 5]
  kerMat7 <- outer(pts7, HW7dat, function(x,y) dlogis(x,y,.1))
  lines(pts7, c(kerMat7 %*% rep(.002,500)) , lty=3)
### The plot shows this is VERY hard to fit way out in the tail where
###    the density is small.

(b) Smoothing-spline method.
> spl7 <- smooth.spline(sort(HW7dat), (1: 500)/500, 
     spar=5.e-7, all.knots=F)
  spldns7 <- predict.smooth.spline(spl7, c(sort(HW7dat)[
     c((1:49)*10,491:500)],15) -> aux7, deriv=1)  
  lines(aux7, spldns7$y)

(c) Recall the previously defined function
> lLk 
function(x, pwt, mn, scal, dens = dlogis) {
   invsc <- 1/scal
   dnsmat <- invsc*matrix(dens(invsc*
        outer(mn,x,"-")), ncol=length(x))
   sum(log(pwt %*% dnsmat)) }
### But note that there is no hope to fit normal to the data-values
###   themselves: instead fit to their logs, or alternatively get
###   mixture of log-normal densities

> lgdt7 <- log(HW7dat)
> tmpft7 <- nlmin(function(w)  -lLk(lgdt7,c(plogis(w[1]),
   (1-plogis(w[1]))*plogis(w[2]),(1-plogis(w[1]))*
   (1-plogis(w[2]))), w[3:5], exp(w[6:8]), dens=dnorm), 
   c(1,0,log(c(.5,3,6)),log(c(.1,.3, .2))), 
   print.level=1, max.fcal=300, max.iter=100)
     0     1
    it    nf      f       reldf      preldf     reldx
     0     1  1.208e4
     1     2  2.610e3    7.840e-1   1.743e0    2.097e-1
     2     4  1.184e3    5.462e-1   6.445e-1   3.158e-1
     3     5  1.013e3    1.450e-1   1.023e0    4.263e-1
     4     7  9.908e2    2.149e-2   1.075e-1   8.528e-2
     5     8  9.156e2    7.589e-2   1.340e-1   1.061e-1
     6     9  8.950e2    2.254e-2   3.941e-2   8.010e-2
...
    49    63  8.674e2    1.385e-10  1.016e-10  1.470e-2
    50    64  8.674e2    7.263e-11  5.124e-11  1.428e-2
 ***** relative function convergence *****
 function     8.673897e2    reldx        1.4276e-2
 function evaluations:  64   gradient evaluations: 571
 preldf       5.1237e-11  npreldf      5.1295e-11

### Now we have parameters for the mixture of normals 
##    on the logs of the observations:
> tmpft7$x
[1] -22.6712479   0.5620639   4.7054183  -1.2237658   0.3266433  
[6]  15.0291864   0.2808054  -0.1560827

## So the weights are plogis(-22.67) = 0, plogis(.56206) = .6369, 
##   and .3631. The two surviving components have
##   means  -1.22377  and  0.322664, and variances
> exp(tmpft7$x[7:8])
[1] 1.3241959 0.8554884

> hist(lgdt7, nclass=60, prob=T)
> lines(-6 + 9*(1:200)/200 -> xt , .6369*dnorm(xt,-1.22377,
    1.32420) + .3631*dnorm(xt,.322664,.855488))
### Not a very good fit: but not a bad one to the derivative of the
###   smoothed df.

(d) Upper quartiles: 
### First superpose the df's in place of normal densities and solve 
###   quantile equation

> Fest7 <- function(x) {
      tmpm <- outer(x,HW7dat, function(x,y)
          .002*pnorm(10*(x-y)))
      c(tmpm %*% rep(1, 500)) - 0.75 }
> Fest7(1:2)
[1] -0.09710544  0.07532441
> uniroot( Fest7, interval=c(1,2))$root
[1] 1.418043	     #### Estimated root for kernel-density method

> uniroot( function(x) predict.smooth.spline(spl7,x)$y-.75, 
     interval=c(1,2))$root
[1] 1.399578	     ### Estimated upper quartile using splines

> exp(uniroot(function(x) .6369*pnorm(x,-1.22377, 1.32420) 
   + .3631*pnorm(x,.322664,.855488) -0.75, log(1:2))$root)
[1] 1.427958	     
     ### Estimated upper quartile from (log-)normal mixture

(e) Bootstrap estimate (1000 replications):
### First generate a confidence interval using the spline 
> tmp7 <- smooth.spline((1:500)/500, sort(HW7dat), spar=5.e-7, 
      all.knots=F)
### Now generate data in blocks of 100.
> pboot7 <- numeric(1000)
  for(i in 1:10) {
     tmpm <- matrix(predict.smooth.spline(tmp7,runif(50000))$y, 
           ncol=100)
     pboot7[(i-1)*100+1:100] <- apply(tmpm,2,
           function(vec) quantile(vec,.75))}
### Now pboot7 contains 1000 upper-quartile values: the mean is 
###   the bootstrap estimate,
> mean(pboot7)
[1] 1.417956 
### and the empirical .025 to .975 quantile interval is the
### confidence interval
> quantile(pboot7,c(.025,.975))
     2.5%    97.5% 
 1.235356 1.741055        ### VERY WIDE !!!

### Next the nonparametric bootstrap:
> npboot7 <- apply(matrix(sample(HW7dat,5e5, replace=T), ncol=1000),2, 
       function(vec) quantile(vec,.75))
> c(quantile(HW7dat,.75), mean(npboot7))
     75%          
 1.388275 1.416397  ### respectively, the upper quartile for this
 ### dataset and the bootstrap estimated upper quartile
### Now comes the nonparametric bootstrap 95% CI:
> 1.388275 - quantile(npboot7-1.388275,c(.975,.025))
    97.5%     2.5% 
 1.062359 1.532689   ### Note the asymmetry !!!