HW 20 Stat 705  Solution

## (A) 
> library(faraway)
  Fat = fat[-c(39,42),c("height","weight")]
  dim(Fat)
[1] 250   2

> fit1 = lm(weight ~ height, data=Fat)
> round(summary(fit1)$coef,5)
              Estimate Std. Error  t value Pr(>|t|)
(Intercept) -194.48613   39.62298 -4.90842        0
height         5.29948    0.56321  9.40934        0

> apply(Fat,2,mean)
  height   weight 
 70.3030 178.0832 

> summary(fit1)$sig
[1] 23.2551

> abline(6,2.5, col="blue")
> abline(Fat$coef[1],Fat$coef[2], col="orange")
> sqrt(quantile(1/rgamma(1e5,.4,100), prob=c(.05,.25,.5,.75,.95)))
       5%       25%       50%       75%       95% 
  7.73515  14.07062  26.07546  64.51002 484.26703 
   
     ### So there is a LOT of uncertainty in the standard deviation, which 
### allows our prior for (a,b) to be so different from the fitted values 
### and yet somewhat compatible with the data.

## Let x = Fat$height and y = Fat$weight , both 250-dimensional vectors.

## From discussion in class and direct calculation we find that the posterior 
## density of the parameters is proportional to :

exp(-(a-6)^2/8 - 2*(b-2.5)^2) * tau^(.4-1) * exp(-100*tau) * tau^(250/2) *

  exp( -(tau/2)*sum_{j=1}^{250} (y[i]-a-b*x[i])^2 )

## Therefore the full conditional posterior densities are:

tau given (a,b,x,y)    ~  Gamma(125.4, 100 + 0.5*sum((y-a-b*x)^2) )

a   given (tau,b,x,y)  ~  N( (3+500*tau*(mean(y)-b*mean(x))/(0.5+500*tau) , 
                             1/(0.25 + 250*tau) )     ## this is variance not sd

b   given (tau,a,x,y)  ~  N( (10+tau*sum(x*(y-a)))/(4+tau*sum(x^2)),
                             1/(4+tau*sum(x^2)) )

## Therefore we code the Gibbs sampler, starting from the least-squares fit, 
## as follows (taking the parameters in the order a,b,tau):

  parm = c(fit1$coef, 1/summary(fit1)$sig^2)
   x = Fat$height; y = Fat$weight
   xbar = mean(x)
   ybar = mean(y)
   sx2 = sum(x^2)
   sxy = sum(x*y)
  parMat = array(0, c(1.5e4, 3))
  for(k in 1:1.5e4) {
      parm[1] = (3+500*parm[3]*(mean(y)-parm[2]*mean(x)))/(0.5+500*parm[3]) + 
                  sqrt(1/(0.25+250*parm[3]))*rnorm(1)
      parm[2] = (10+parm[3]*(sxy-250*parm[1]*xbar))/(4+parm[3]*sx2) + 
                  sqrt(1/(4+parm[3]*sx2))*rnorm(1) 
      parm[3] = rgamma(1, 125.4, 100 + 0.5*sum((y-parm[1]-parm[2]*x)^2))
      parMat[k,] = parm }

> hist(parMat[5001:15000,1], nclass=50, prob=T, xlab="a", main="")
> hist(parMat[5001:15000,2], nclass=50, prob=T, xlab="b", main="")
> hist(1/sqrt(parMat[5001:15000,3]), nclass=50, prob=T, xlab="sig", main="")

### These values are respectively centered near:
> round(c(median(parMat[5001:15000,1]), median(parMat[5001:15000,2]),
    1/sqrt(median(parMat[5001:15000,3]))), 4)
[1]  5.4918  2.4595 24.3802

### We can expect that this would not change with another several blocks 
###   of 10000 Gibbs-Sampler iterations.

### (i)-(ii) smoothed posterior density estimates of  a, b, and sig , overplotted
### with normal densities of same mean and variance

> kerfcn = function(xnew, bw, pts,  kern=function(x) dnorm(x)) {
       n = length(pts)
       c(outer(xnew, pts, function(x,y) kern((x-y)/bw)/bw) %*% rep(1/n,n)) }

> tmp = parMat[5001:15000,1]
  hist(tmp, nclass=60, prob=T, xlab="Intercept Param = a", main=
     "Overplotted Hist, Density, Normal Approx.")  
  curve(kerfcn(x,.25, tmp), -2, 15, add=T, col="orange")
  curve(dnorm(x,mean(tmp),sd(tmp)),-2,15, add=T, col="blue")
          ### Very good approximation here

> tmp = parMat[5001:15000,2]
  hist(tmp, nclass=60, prob=T, xlab="Slope Param = b", main=
     "Overplotted Hist, Density, Normal Approx.")  
  curve(kerfcn(x,.02, tmp), 2.3, 2.6, add=T, col="orange")
  curve(dnorm(x,mean(tmp),sd(tmp)),2.3,2.6, add=T, col="blue")
   ### Here the approximation is not so good: posterior is slightly less peaked
   ### and more spread than the approximating normal with same mean and variance

### 
> tmp = 1/sqrt(parMat[5001:15000,3])
  hist(tmp, nclass=60, prob=T, xlab="SD Param = sig", main=
     "Overplotted Hist, Density, Normal Approx.")  
  curve(kerfcn(x,.2, tmp), 20, 30, add=T, col="orange")
  curve(dnorm(x,mean(tmp),sd(tmp)),20,30, add=T, col="blue")
   ### Here the approximation is again very good.

### (iii) the posterior medians
> round(c(median(parMat[5001:15000,1]), median(parMat[5001:15000,2]),
    1/sqrt(median(parMat[5001:15000,3]))), 4)
[1]  5.4918  2.4595 24.3802

### (iv) 90%  Bayesian credible intervals 
> round(apply(cbind(parMat[5001:15000,1:2], 1/sqrt(parMat[5001:15000,3])),2,
      function(col) quantile(col, prob=c(.05,.95))), 3)
     [,1]  [,2]   [,3]
5%  2.198 2.401 22.723
95% 8.822 2.517 26.328

### (v) 90% (frequentist, least-squares) confidence intervals calculated from 
###		the original "fat" dataset.
> inta = fit1$coef[1] + c(-1,1)*qnorm(.95)*summary(fit1)$coef[1,2]
[1] -259.6601 -129.3121
  intb = fit1$coef[2] + c(-1,1)*qnorm(.95)*summary(fit1)$coef[2,2]
[1] 4.373074 6.225885

### posterior probability that a and b BOTH lie within these intervals
> mean(parMat[5001:15000,1]>inta[1] & parMat[5001:15000,1] < inta[2] &
       parMat[5001:15000,2]>intb[1] & parMat[5001:15000,1] < intb[2])   ### = 0
		
	
### (B)  The purpose of the generation of new "pseudo"-datasets in the last part is 
## to see whether the "posterior predictive" data-generating mechanism gives data 
## with the same general qualitative properties as the original dataset. You can 
## think of this as a kind of goodness-of-fit check.

## "Posterior-predictive" means that we want to use the same model, but with 
## parameters (a,b,tau) that are consistent with what we know about those parameters 
## after looking at the dataset and computing the posterior. Thus,the parameters 
## (a,b,tau) should be drawn from the POSTERIOR not the prior ! Then, from the 
##  parameters, we generate brand-new data using the conditional density f(y|a,b,tau,x).

param = parMat[c(14200, 14400, 14600, 14800),]
> param
         [,1]     [,2]        [,3]
[1,] 3.285990 2.489228 0.001947559
[2,] 7.597364 2.397629 0.001975801
[3,] 1.498228 2.535951 0.001572550
[4,] 1.802602 2.526802 0.001551472

### Recall that the 3rd column is tau=1/sig^2

## Now simulate 4 datasets y = weight of size 250, holding x = height fixed.

> NewDat = array(0, c(250,4))
  for(i in 1:4) NewDat[,i] = param[i,1]+param[i,2]*Fat$height+ rnorm(250)/sqrt(param[i,3])

> plot(Fat$height, NewDat[,1])
  points(Fat$height, Fat$weight, pch=18)
        ### The empty circlesare the posterior predictive pseudo-data;
  ### they look very much like the original data (solid diamonds) except for 
  ### extra points along the lower boundary of the point cloud !

> plot(Fat$height, NewDat[,2])
  points(Fat$height, Fat$weight, pch=18)
   ## similar phenomenon, the old points (black diamonds) are slightly higher
  ## and to the left of the 2nd sample of pesudo-data.

> plot(Fat$height, NewDat[,3])
  points(Fat$height, Fat$weight, pch=18)
  abline(fit1$coef, col="red")
  ## now the old points are generally more concentrated toward the fit1 line

> plot(Fat$height, NewDat[,4])
  points(Fat$height, Fat$weight, pch=18)
   ### similar pattern as for plots of pseudo datasets 1 and 2.