HW 19 Stat 705  Fall 2016

Assigned Wednesday 11/30/16 DUE Friday 12/9/16 extended to Sunday 12/11
----------------------------------------------

Read in the dataset "precip" from the base-R datasets, containing
average amount of annual precipitation (rainfall) in inches for each 
of 70 United States (and Puerto Rico) cities. 

==================================================

> pts = seq(5,70, len=326)
  tmphist = hist(precip, nclass=10, prob=T, plot=F)
> sum(tmphist$density)*5
[1] 1

### Part (A)
##    (i) A kernel density estimate with dlogis(x) density and bandwidth 2.

  kermat1 = outer(pts, precip, function(x,y) dlogis(x,y,2))
  dens1 = cbind(pts, c(kermat1 %*% rep(1/70,70)))
  lines(dens1, lty=2)

### (ii) Kernel density estimate with the non-symmetric density dchisq(x+2,4), 
### also with bandwidth 2.
###    Comment on the density estimates (i) versus (ii) as a good visual 
### representation of the data, e.g. versus a histogram with 10 bars.

  kermat2 = outer(pts, precip, function(x,y) 0.5*dchisq(2 + (x-y)/2,4))
  dens2 = cbind(pts, c(kermat2 %*% rep(1/70,70)))
  lines(dens2, lty=3, col="blue")

### With the centered chisq kernel, the resulting plotted density estimate 
###   is a little bit shifted to the right, and altogether somewhat less 
##    smooth for the same nominal bandwidth.

### BUT as one student pointed out, dchisq(2+x,4) is not a density with mean 
## while  dchisq(4+x,4) or dchisq(2+x,2) would be.
## Any student who uses the better choice [one of these] will get full credit

 kermat2B = outer(pts, precip, function(x,y) 0.5*dchisq(4 + (x-y)/2,4))
  dens2 = cbind(pts, c(kermat2B %*% rep(1/70,70)))
  lines(dens2, lty=3, col="blue")

### (iii) Find the best kernel-density estimate with dnorm(x) kernel for these 
data, optimized over the bandwidth h with respect to the Integrated-Squared Error
criterion over the interval (0,80) for x, using the kernel-window  dnorm(x,y,h) 
and approximating to the empirical distribution for the "precip" data.

> lLk = function(parm, datvec) {
   ### here parm is a vector consisting of (qlogis(p1), qlogis(p2/(1-p1)),      
      ###        mu1,mu2,mu3,sig1,sig2,sig3)
   p1 = plogis(parm[1])
   p2 = (1-p1)*plogis(parm[2])
   sum(log(p1*dnorm(datvec,parm[3], parm[6]) + 
           p2*dnorm(datvec,parm[4], parm[7]) + 
           (1-p1-p2)*dnorm(datvec,parm[5],parm[8])))  }

> tmpft3 = nlm(function(w)  -lLk(w,precip), c(qlogis(0.2), qlogis(.7/.8), 
             15,40,57,5,10,3))
> tmpft3
$minimum
[1] 273.2866

$estimate
[1] -1.4762604  3.4370269 12.8896651 39.2815427 59.5029519  4.1454275  8.8230009
[8]  0.2999855

$gradient
[1]  6.391831e-08 -4.713485e-08  3.146535e-08  1.414518e-08  7.615685e-08
[6] -9.831732e-08 -9.296730e-08 -1.916476e-06

$code
[1] 1

$iterations
[1] 27

#---------------
### NOTE: since some of you (Stat 705 students in Fall 2016) used the 
###   function I coded in the DensEst.Log  log I coded in 2009, here 
###   is the result from that:

> lLk2 = function(x, pwt, mn, scal, dens = dlogis) {
   invsc = 1/scal
   dnsmat = invsc*matrix(dens(invsc*
        outer(mn,x,"-")), ncol=length(x))
   sum(log(pwt %*% dnsmat)) }

> nlm(function(w)  -lLk2(precip, c(plogis(w[1]),(1-plogis(w[1]))*
      plogis(w[2]), (1-plogis(w[1]))*(1-plogis(w[2]))), w[3:5],w[6:8], 
      dens=dnorm), c(qlogis(0.2), qlogis(.7/.8), 15,40,57,3,8,2), 
      stepmax=10, steptol=1.e-8, gradtol=1.e-8)

... essentially identical answer: but note that you needed a slightly
different starting value without transforming the scal parameter to 
be equal to  exp(logscal)  with this alternative lLk2 code ...

##----------- Solution continued

##   (iv) Fit the Maximum Likelihood estimated mixture of 3 normally distributed
## components based on the "precip" dataset. (Find the maximum likelihood
## mixture-distribution parameters either by numerical maximization or EM.)
##    Explain whether and why you think that a 3-component mixture was a good 
## choice for these data.

> dens3 = function(x) {
        p1=plogis(-1.47626)
        p2=(1-p1)*plogis(3.43703)
        p1*dnorm(x,12.88967, 4.14543) + p2*dnorm(x,39.28154, 8.82300) + 
           (1-p1-p2)*dnorm(x,59.50295,0.29999) }
 curve(dens3(x), add=T, lty=6, col="orange")

    ### 2 of the components are very good, but the third is small and too peaked
### The reason for this very sharp and small third peak, which is not well 
### displayed  by the histogram, are two very close data points:

> precip[abs(precip-60)<2]
   Miami San Juan 
    59.8     59.2 

### Solution of (iii) concluded in the next steps:
 
### Next create the piecewise constant function given by the histogram
> hstdns = function(x) {
              out=numeric(length(x))
              ind = (x>=5 & x<70)
              out[ind] = tmphist$density[x[ind] %/% 5] 
              out  }
> integrate(hstdns,0,80)
1 with absolute error < 1.1e-14

> densIMSE = function(h) {
            Intg1 = function(z) c(outer(z,precip, function(x,y) 
                        dnorm(x,y,h)) %*% rep(1/70,70))
            integrate(function(x) (Intg1(x)-hstdns(x))^2,0,80)$val }
> densIMSE(2)
[1] 0.0009978848
> densIMSE(3)
[1] 0.001164282
> densIMSE(1.5)
[1] 0.001118301

> optimize(densIMSE,c(0.2,10))
$minimum
[1] 1.965457                   #### the "optimal" bandwidth is close to 2

$objective
[1] 0.000990904

### (B) Hand in the R code which you used in answering the questions in (A), and 
## display all of the density curves on a single plot, if possible, containing 
## also the scaled relative frequency histogram for the "precip" data (with 10 
## or 12 bars). Make sure that your plot is properly labeled, using color and/or 
## line-types to distinguish the different density curves, and make sure your plot 
## has a descriptive legend. If you cannot fit all of this stuff into one readable
## picture use two pictures.

> hist(precip, nclass=10, prob=T, main="Histogram of  precip  with density est's")
  lines(dens1, lty=2)
  lines(dens2, lty=3, col="blue")
  curve(dens3(x), add=T, lty=6, col="orange")
  kermat4 = outer(pts, precip, function(x,y) dnorm(x,y,1.96546))
  lines(cbind(pts, c(kermat4 %*% rep(1/70,70))), lty=1, col="green")
  
> legend(locator(), legend=c("logistic ker dens, h=2", "chisq4 ker dens, h=2",
         "mix norm dens, ML", "norm ker dens, h opt"), lty=c(2,3,6,1),
         col=c("black","blue","orange","green"))