Homework Solutions Logs. ======================== (1) > V1 <- rep(c(0,1,1),40) V2 <- sin(2*pi*sqrt(1:120)) ### A better version (used below) would be: ### V2 <- round(sin(2*pi*sqrt(1:120)),1e-12) V3 <- ifelse(V2>=0, "H", "L") newmat <- cbind(Group = V1, Y = V2, Type = V3) newfram <- cbind.data.frame(Group = V1, Y = V2, Type = V3, stringsAsFactors=F) ### Note that the "stringsAsFactors=F" optional argument is ### needed to avoid automatic conversion of character data ### to factor-type. > unlist(lapply(newfram,class)) Group Y Type "integer" "numeric" "character" > newfram[1:5,] Group Y Type 1 0 0 H 2 1 1 H 3 1 -1 L 4 0 0 H 5 1 1 H > newmat[1:5,] Group Y Type [1,] "0" "0" "H" [2,] "1" "1" "H" [3,] "1" "-1" "L" [4,] "0" "0" "H" [5,] "1" "1" "H" (2) > exmpfram <- read.table(file="exampfram.txt", stringsAsFactors=F, skip=1) > dim(exmpfram) [1] 216 13 > exmpfram <- exmpfram[,3:9] > names(exmpfram) <- c("IDNUM","DTH","EVTTIME","TRTGP","LOGBILI", "AGEVAR","CIRRH") > exmpfram$CIRRH <- exmpfram$CIRRH == "T" > class(exmpfram$CIRRH) [1] "logical" > exmpfram2 <- scan(file="exampfram.txt", skip=1) ### This is a giant vector, of type "character" and length 216*13=2808 ### We are interested in pulling out the elements 3:8 in each row, ### which now correspond to elements with indices 3:8 plus ### multiples 0:215 of 13. > inds <- rep(3:9,216) + rep(13*(0:215),rep(7,216)) inds2 <- c(outer(3:9, 13*(0:215), "+")) > sum(abs(inds-inds2)) [1] 0 > exmpfram2 <- data.frame(matrix(exmpfram2[inds], ncol=7, byrow=T, dimnames=list(NULL, names(exmpfram))), stringsAsFactors=F) > exmpfram2[,7] <- as.logical(exmpfram2[,7]) > for(i in c(1:3,5:6)) exmpfram2[,i] <- as.numeric(exmpfram2[,i]) > unlist(lapply(exmpfram2,class)) IDNUM DTH EVTTIME TRTGP LOGBILI AGEVAR CIRRH "numeric" "numeric" "numeric" "character" "numeric" "numeric" "logical" > c(sum(exmpfram == exmpfram2), sum(unlist(lapply(exmpfram2,class))== unlist(lapply(exmpfram,class)))) [1] 1512 7 ### solves part (c), since all 216*7 entries match (including type) > t(apply(exmpfram[,c(1:3,5:6)],2,range)) ### solution to part (d) [,1] [,2] IDNUM 1.00000 248.00000 DTH 0.00000 1.00000 EVTTIME 0.00000 139.44000 LOGBILI 0.69897 2.72263 AGEVAR 1.64900 330.30000 ### same matrix is produced by: > cbind(apply(exmpfram[,c(1:3,5:6)],2,min), apply(exmpfram[,c(1:3,5:6)],2,max)) (3) (a) > matcols <- function(xvec, outcols) { outmat <- cbind(rep(1,length(xvec)), xvec) if(outcols > 1) for(i in 2:outcols) outmat <- cbind(outmat, xvec^i) outmat } > matcols(1:3,3) xvec [1,] 1 1 1 1 [2,] 1 2 4 8 [3,] 1 3 9 27 > matcols(2:5,1) xvec [1,] 1 2 [2,] 1 3 [3,] 1 4 [4,] 1 5 (b) > convtype <- function(infram, charac=NULL, numer=NULL, bool=NULL){ tmpfram <- infram if(!is.null(charac) && charac == "factor") { charcols <- unlist(lapply(tmpfram, is.character)) if(sum(charcols)) for(i in (1:ncol(tmpfram))[charcols]) tmpfram[,i] <- factor(tmpfram[,i]) } if(!is.null(numer) && numer == "charac") { numcols <- unlist(lapply(tmpfram, is.numeric)) if(sum(numcols)) for(i in (1:ncol(tmpfram))[numcols]) tmpfram[,i] <- as.character(tmpfram[,i]) } if(!is.null(bool) && (bool=="charac" | bool=="numer")) { boolcols <- unlist(lapply(tmpfram, is.logical)) if(bool=="charac") for(i in (1:ncol(tmpfram))[boolcols]) tmpfram[,i] <- as.character(tmpfram[,i]) else for(i in (1:ncol(tmpfram))[boolcols]) tmpfram[,i] <- as.numeric(tmpfram[,i]) } tmpfram } > auxfram <- cbind.data.frame(factor(as.character(1:10)), rep(c(3,5),5), rep(c(3,5),5) < 4, letters[12:21], stringsAsFactors=F) > auxfram X1 X2 X3 X4 1 1 3 T l 2 2 5 F m 3 3 3 T n 4 
4 5 F o 5 5 3 T p 6 6 5 F q 7 7 3 T r 8 8 5 F s 9 9 3 T t 10 10 5 F u > unlist(lapply(auxfram,class)) X1 X2 X3 X4 "factor" "integer" "logical" "character" > convtype(auxfram, charac="factor") X1 X2 X3 X4 1 1 3 T l 2 2 5 F m 3 3 3 T n 4 4 5 F o 5 5 3 T p 6 6 5 F q 7 7 3 T r 8 8 5 F s 9 9 3 T t 10 10 5 F u > unlist(lapply(.Last.value,class)) X1 X2 X3 X4 "factor" "integer" "logical" "factor" > convtype(auxfram, numer="charac", bool="numer") X1 X2 X3 X4 1 1 3 1 l 2 2 5 0 m 3 3 3 1 n 4 4 5 0 o 5 5 3 1 p 6 6 5 0 q 7 7 3 1 r 8 8 5 0 s 9 9 3 1 t 10 10 5 0 u > unlist(lapply(.Last.value,class)) X1 X2 X3 X4 "factor" "character" "numeric" "character" (4) (a) > SimLin <- function(errsd, coef, xvec=NULL, simx = NULL) { ### NB: required arguments are: ### errsd = error standard deviation ### coef = vector with 2 components (intercept,slope) ### Either vector xvec or sim=(mean,sd,length) ### parameter vector must be given non-null if(is.null(xvec)) xvec <- rnorm(simx[3],simx[1],simx[2]) list( xvec = xvec, coef = coef, errsd = errsd, simx = simx, yvec =coef[1]+coef[2]*xvec+rnorm(length(xvec))*errsd) } ### eg > SimLin(0.5, c(-1,1), 1:5) -> slst1 > round(slst1$yvec,3) [1] -0.010 0.494 2.458 2.309 3.765 (b) > slst2 <- SimLin(0.5, c(-1,1), xvec=runif(1000)) slst3 <- SimLin(1, c(2,.2), simx= c(0,1,1000)) ## For first case, show the numbers of xvec obs in each interval ## (k/10, (k+1)/10], and theoretical and observed mean, variance, ## covariance among xvec and yvec values > table(floor(slst2$xvec*10)) 0 1 2 3 4 5 6 7 8 9 110 93 104 100 104 99 113 101 105 71 ## Fairly even distribution: BUT probability for a count <= 71 in a ## specific interval is: > pbinom(71,1000,.1) [1] 0.0008603278 ### This is too small !! Bad random numbers! ## The next 10 batches of runif(1000) numbers gave smallest interval ## counts resp. equal to 85, 79, 93, 88, 93, 86, 79, 87, 85, 90 > round(c(mean(slst2$xvec), mean(slst2$yvec), var(cbind(slst2$xvec, slst2$yvec))), 4) [1] 0.4871 -0.5194 0.0796 0.0724 0.0724 0.3273 ### cf. .5, -.5 .0833 .0833 .0833 .3333 Theor. values ### Now exhibit residuals epsilon for slst3 , through fractions ### falling in ranges (-10,-1.5), (-1.5,-.5), (-.5,.5), (.5,1.5), (1.5,10) > eps3 <- slst3$yvec - 2 - .2*slst3$xvec hist(eps3, breaks=c(-10,-1.5,-.5,.5,1.5,10), plot=F) .. $counts: [1] 72 234 374 241 79 ### compare with theoretical expected values > round(diff(pnorm(c(-10,-1.5,-.5,.5,1.5,10)))*1000,2) [1] 66.81 241.73 382.92 241.73 66.81 ### OK !!! ### And again: means and (co)variances followed by theoretical values > round(c(mean(slst3$xvec), mean(slst3$yvec), var(cbind(slst3$xvec, slst3$yvec))), 4) [1] 0.0137 2.0281 1.0110 0.1638 0.1638 1.0816 ### 0 2 1 .2 .2 1.04 Theor. values (5) Recall from class that in order to achieve precision delta with probability at least 1-alpha, we apply the formula B > = (qnorm(1-alpha/2)^2 * sigst^2 / delta^2 where sigst is any upper bound or estimate for the standard deviation of the simulated iid variables. (a) Since alpha=delta=1e-2, and since the variables X are uniformly bounded between 0 and sqrt(3), so that sigst <= 3/4, our formula gives B >= 2.576^2*.75/1.e-4 = 49768.32. So we use B=5e4. > tmpvec <- runif(5e4) tmpvec <- sqrt(1+tmpvec+tmpvec^4) sigest <- sqrt(var(tmpvec)) c(mean(tmpvec) + c(-1,1)*qnorm(.995)*sigest/sqrt(5e4), sigest/0.75) [1] 1.2878769 1.2924819 0.2665071 ### So the estimated integral is 1.2902, with precision a bit better ### than required. 
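Added note (not in the original log): the sample-size bound just used is easy to wrap in a small helper, which makes the same calculation reusable in part (b). The name mcB and its arguments are illustrative only.

> mcB <- function(delta, alpha, sigsq) ceiling(qnorm(1 - alpha/2)^2 * sigsq/delta^2)
> mcB(1e-2, 1e-2, 0.75)    ### part (a): just under 5e4, consistent with using B = 5e4
> mcB(1e-2, .01/6, 0.25)   ### part (b) below: about 24712, the Bonferroni-adjusted bound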
The exact answer is given by: > tmpl <- integrate(function(x) sqrt(1+x+x^4),0,1) c(tmpl$integral, tmpl$abs.error) [1] 1.288417e+00 1.066592e-11 ### OK, our error was +.00176 (b) Note that the r.v.'s of interest are indicators of whether or not ## there are k distinct values in a given simulation. These are ## all binary (ie Bernoulli) so have sigsq <= 0.25, so for ## alpha = .01/6, get the formula B >= qnorm(1-.01/12)^2*.25/1.e-4 ## = 24711.53 . (We divided by 6 --- using `Bonferroni's ## inequality' --- to ensure that the union of the seven event each ## of which may have probability up to .01/6, will altogether have ## probability <= .01. So we can use B = 25000; I used B=25500. > tmpmat <- array(0, dim=c(25500,10)) for(i in 1:6) { tmpvec <- sample(1:10,25500, replace=T) tmpmat[cbind(1:25500,tmpvec)] <- 1 } dvec <- c(tmpmat %*% rep(1,10)) > round(table(dvec)/25500, 4) 2 3 4 5 6 0.0029 0.0629 0.3275 0.4593 0.1474 ### Compare with true analytically calculated values: 0.0028 0.0648 0.3276 0.4536 0.1512 ### NB. The probability that only 1 distinct value occurs = 1e-5. ### The largest absolute error turns out to have been: .0057 ### which indicates that our Bonferroni-inequality calculation ### was not terribly conservative. *** Reasoning for exact calculation: for example, P(5 distinct) = 1e-6*( 10*{6 choose 2}*9*8*7*6) =.4536. *** The others can be done similarly (with more distinct cases). (6) > cathed <- read.table(file="Cathed.dat") > motif() > plot(cathed[,4], cathed[,3], type="n", xlab="Length (feet)", ylab="Height (feet)", main= "English Cathedral Heights vs Lengths, by Style") points(cathed[cathed[,2]==0,4],cathed[cathed[,2]==0,3], pch=5) points(cathed[cathed[,2]==1,4],cathed[cathed[,2]==1,3], pch=18) tm0 <- lm(V3 ~ V4, data=cathed[cathed[,2]==0,])$coef abline(tm0[1],tm0[2], lty=3) tm1 <- lm(V3 ~ V4, data=cathed[cathed[,2]==1,])$coef abline(tm1[1],tm1[2], lty=1) legend(locator(), legend=c("Romanesque","Gothic"), marks=c(5,18), lty=c(3,1)) printgraph(file="Graphic2.ps") (7) > motif() > betvec <- rbeta(1600,2,3) hist(betvec, nclass=40, prob=T, xlab="Simulated variate scale", ylab="Scaled relative frequency", main= "Scaled Rel Freq Histogram: Beta Data & Overlaid Density") lines((1:200)/200, dbeta((1:200)/200,2,3), lty=2) (8) (a) Here the point is that, in using the "bivariate probability integral transform" method discussed in class, you can work either with F_X and F_{Y|X} or with F_Y and F_{X|Y}. If the objective is to do everything analytically, then F_Y and F_{X|Y} works better, so that will be the way I do it first (Method 1). The other way works too, but uses a little more in the way of Splus trickery because F_X, while simple, cannot be analytically inverted. That will be Method 2, and I use the two methods to check each other. ## First we exhibit the marginal and conditional df's. 
> Fx <- function(x) 1 - 1.5/x^2 + .5/x^3 Fy <- function(y) 1-1.5/y + .5/y^2 Finvy <- function(r) .75*(1 + sqrt(1-8*(1-r)/9))/(1-r) FYcondX <- function(x,y) ifelse(y <= x, (y-1)/(2*x-1), 1 - x^2/(y*(2*x-1))) FXcondY <- function(x,y) ifelse(x <= y, 1.5*(1-1/x)/(1.5-1/y), 1 - .5*(y/x)^3/(1.5*y-1)) FYinvcondX <- function(r,x) ifelse(r <= FYcondX(x,x), (2*x-1)*r+1, x^2/((2*x-1)*(1-r))) FXinvcondY <- function(r,y) ifelse(r <= FXcondY(y,y), 1/(1-r*(1-2/(3*y))), y*(.5/((1-r)*(1.5*y-1)))^(1/3)) ## NB we can test these inverses simply, eg by generating ## arbitrary X values on (1,infty) and R on (0,1) and ## then making sure that FYcondX(FYinvcondX(R,X),X)=R > Xtmp <- exp(rexp(100)); Rtmp <- runif(100) > sum(abs(FYcondX(Xtmp, FYinvcondX(Rtmp,Xtmp))-Rtmp)) [1] 1.259409e-15 ### this is =0, OK > sum(abs(FXcondY(FXinvcondY(Rtmp,Xtmp),Xtmp)-Rtmp)) [1] 5.972653e-15 ### OK again > sum(abs(Fy(Finvy(Rtmp))-Rtmp)) [1] 2.664535e-15 ### OK again METHOD 1. > Ytmp <- Finvy(runif(1000)) Xtmp <- FXinvcondY(runif(1000),Ytmp) ### We display the histogram of log(Xtmp) versus ### the theoretical density exp(z)*fx(exp(z)) > hist(log(Xtmp), nclass=36, prob=T, main= "Scaled Histogram of log(X)") > lines((1:400)/100 -> lxp, 3*exp(-2*lxp)-1.5*exp(-3*lxp), lty=3) ### Virtually perfect agreement > hist(log(Ytmp), nclass=36, prob=T, main= "Scaled Histogram of log(Y)") lines((1:400)/100 -> lxp, 1.5*exp(-lxp)-exp(-2*lxp), lty=3) ### Again very close agreement METHOD 2. > xpt <- exp((0:400)/80) > Xtmp2 <- predict.smooth.spline(smooth.spline(Fx(xpt), xpt, spar=1.e-6), runif(1000))$y Ytmp2 <- FYinvcondX(runif(1000), Xtmp2) (b) Here there are simple formulas: X = R*cos(Th), Y = R*sin(Th) where R ~ F_R(r) = (2/pi)*arc tan(r^2) and Th ~ Unif(0,2*pi) are independent. > Thvec <- 2*pi*runif(1000) Rvec <- sqrt(tan(pi*runif(1000)/2)) Xvec <- Rvec*cos(Thvec) Yvec <- Rvec*sin(Thvec) (9) (a) The region of interest is the set in the place with x-coordinates ranging from -5 to 5, and with y ranging from 0 to min((10 +2x)/3, 5 - x). It is easy to see that the density of X is not uniform but rather is proportional to the height (max y-coordinate) of the triangle for each value x in [-5,5]. The constant of proportionality is the reciprocal of the area of the triangle, ie fX(x) = .05*min((10+2x)/3, 5 - x). The corresponding df is FX(x) = (.05/3)*(x+5)^2 for x<1, and = 1 - .025*(5-x)^2 for 1<= x < 5. Thus: > XYfcn <- function(N) { Xval <- numeric(N) U1val <- runif(N) ind1 <- U1val <= .6 Xval[ind1] <- sqrt(60*U1val[ind1])-5 Xval[!ind1] <- 5-sqrt(40*(1-U1val[!ind1])) Yval <- pmin(10/3 + (2/3)*Xval, 5 - Xval)*runif(N) list(Xval=Xval, Yval=Yval) } An alternate method here is to generate distributions in the rectangle [-5,5] x [0,4], and reflect observations falling outside the triangle across the lines y=10/3 + (2/3)*x if x<1 and across y=5-x if x>1, to obtain points in the tirangle in every case: > XYfcn2 <- function(N) { Xmat <- cbind(-5 + 10*runif(N), 4*runif(N)) inds1 <- Xmat[,1] < 1 & Xmat[,2] > 10/3 + (2/3)*Xmat[,1] Xmat[inds1,] <- outer(rep(2,sum(inds1)), c(-2,2),"*") - Xmat[inds1,] inds2 <- Xmat[,1] >= 1 & Xmat[,2] > 5-Xmat[,1] Xmat[inds2,] <- outer(rep(2,sum(inds2)), c(3,2),"*") - Xmat[inds2,] list(Xval=Xmat[,1], Yval = Xmat[,2]) } tmp2 <- XYfcn2(1000) When plotted, both of the two 1000-vectors look uniform over the triangle. For a more precise test, we do a chi-square with cells defined by x-values in intervals [-5,-3], [-3,-1], [-1,1], [1,3], [3,5] and with y-values in intervals [0,2] and [2,4]. 
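The object tmp tabulated in the chi-square check just below is not created anywhere in this excerpt; presumably it was produced from the first simulator in the same way tmp2 was, i.e. something like:

> tmp <- XYfcn(1000)     ### assumed; only tmp2 <- XYfcn2(1000) appears above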
> t1 <- table((tmp$Xval+1) %/% 2 , (tmp$Yval %/% 2)) > t1 0 1 -2 58 0 -1 180 18 0 196 129 1 218 110 2 91 0 > t2 <- table((tmp2$Xval+1) %/% 2 , (tmp2$Yval %/% 2)) t2 0 1 -2 63 0 -1 181 14 0 184 146 1 195 106 2 111 0 ** The theoretical areas of the intersections of the desired triangle with the rectangular cells, expressed as a fraction of the area 20 of the whole triangle, are as follows: > ea <- round(.05*matrix(c(2*.5*(4/3),0,2*(.5*(5/3)+.5*(6/3)), 1/3, 4, 8/3, 4, 2, 2, 0), byrow=T, ncol=2)*1000,2) > ea [,1] [,2] [1,] 66.67 0.00 [2,] 183.33 16.67 [3,] 200.00 133.33 [4,] 200.00 100.00 [5,] 100.00 0.00 ### Now for the 2 chi-square statistics on 5 df : > c(sum(((c(t1)-c(ea))^2/c(ea))[c(ea)>0]), sum(((c(t2)-c(ea))^2/c(ea))[c(ea)>0])) [1] 4.944696 4.838281 #### OK !!! (b) The rejection method is easier to program. Note that since we should reject about half of the random numbers, to get 1000 pairs it should suffice to generate 2200. (To check this, note that > pbinom(2000,2200,.5) ### = 1 More generally, we propose to generate 400 + 2.2*N pairs. > XYfcn3 <- function(N) { naux <- ceiling(400+2.2*N) Xmat <- cbind(-5+10*runif(naux), 4*runif(naux)) inds <- (1:naux)[(Xmat[,1] < 1 & Xmat[,2] < 10/3+ (2/3)*Xmat[,1]) | (Xmat[,1] >= 1 & Xmat[,2] < 5-Xmat[,1])] list(Xval=Xmat[inds[1:N],1], Yval=Xmat[inds[1:N],2]) } tmp3 <- XYfcn3(1000) > sum(((c(table((tmp3$Xval+1) %/% 2 , (tmp3$Yval %/% 2))) - c(ea))^2/c(ea))[c(ea)>0]) ### now chisq = 5.727582 (10) Here we do not have the luxury of analytically known Finv or even F . First evaluate numerical integral via Simpson's rule, then invert using the smoothing-spline trick. ### First tabulate cdf at points k/10, k=0:100 > xpt <- (0:200)/10 fpt <- 0.3851565*log(1+xpt)*xpt^2*exp(-xpt) fpt[1] <- 0 Fpt <- pmin((6/60)*cumsum(fpt)[seq(1,201,2)],1) xpt <- xpt[seq(1,201,2)] tspl <- smooth.spline(Fpt,xpt, spar=1.e-6) ### Now with this pre-processing done: to generate 1000 desired ### points: > Xvec <- predict.smooth.spline(tspl, runif(1000))$y ### density looks bimodal!! But there is a unique root for its ### derivative, near 2. > summary(Xvec) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.2057 2.0470 3.0320 3.3280 4.2550 11.9500 > hist(Xvec,nclass=36, prob=T) > lines((0:200)/10, fpt) ### very nice fit !! ## To solve part (a) as requested, in a function, can take > ftmp1 <- function(uvec) predict.smooth.spline(tspl, uvec)$y > Xvec2 <- predict.smooth.spline(tspl, runif(1000))$y > summary(Xvec2) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.2039 2.1330 3.1960 3.5330 4.5120 12.9600 ### looks very much the same ### For both cases, let's check that we get the right relative ### frequency on (3.5,7): > integrate(function(x) 0.3851565*log(1+x)*x^2*exp(-x), 1.e-6,7)$integral ### = 0.9493278 > integrate(function(x) 0.3851565*log(1+x)*x^2*exp(-x), 1.e-6,3.5)$ value integral ### = 0.5621234 > 0.9493278-0.5621234 [1] 0.3872044 > c(sum(Xvec < 7 & Xvec > 3.5), sum(Xvec2 < 7 & Xvec2 > 3.5))/1000 [1] [1] 0.355 0.399 ### close enough to the theoretical value 0.387 ### For accept/reject method, we must find a density g(x) from which we know how to simulate, such that f(x) <= k*g(x) for a not-too-large constant k. The simplest way is to use the inequality log(1+x) < x to take g(x) proportional to x^3*exp(-x), i.e., Gamma(4,1). ### So g(x) = (1/6)*x^3*exp(-x), and f(x) <= 6c g(x), and k=6*c = 2.310939. 
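Before using this bound in the accept/reject sampler below, it can be checked numerically; a small added verification (the grid xx is illustrative, not from the original log):

### f(x)/g(x) = 6*0.3851565*log(1+x)/x, which decreases in x and approaches its
### supremum 6*0.3851565 = 2.310939 as x -> 0+.
> xx <- seq(0.001, 20, by=0.001)
> max(6*0.3851565*log(1+xx)/xx)    ### just below 2.310939, attained at the left end of the grid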
> XsimAcc <- function(N, fac=4) { ### Simulate fac*N + 300 variates naux <- ceiling(fac*N+300) Xaux <- rgamma(naux,4) Uaux <- runif(naux)*2.310939 Xaux[(1:naux)[Uaux < 2.310939*log(1+Xaux)/Xaux]][1:N] } > Xvec3 <- XsimAcc(1000, 5) summary(Xvec3) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.3199 2.2220 3.1540 3.4660 4.4400 13.3500 > integrate(function(x) 0.3851565*log(1+x)*x^2*exp(-x), 1.e-6,2)$integral [1] 0.2051833 > integrate(function(x) 0.2600396*(1+log(x))*x^2*exp(-x), 1.e-6,4)$integral [1] 0.661278 > integrate(function(x) 0.2600396*(1+log(x))*x^2*exp(-x), 1.e-6,8)$integral [1] 0.9754157 ### So let's do chisquare (with 3df) on all three generated ## X-vectors. > evec <- diff(c(0,0.2051833 , 0.661278, 0.9754157, 1))*1000 > evec [1] 205.1833 456.0947 314.1377 24.5843 > sum((hist(Xvec, breaks=c(0,2,4,8,20), plot=F)$counts-evec)^2/evec) [1] 9.732789 ### NB p-value = 0.0210 > sum((hist(Xvec2, breaks=c(0,2,4,8,20), plot=F)$counts-evec)^2/evec) [1] 5.10709 > sum((hist(Xvec3, breaks=c(0,2,4,8,20), plot=F)$counts-evec)^2/evec) [1] 4.73794 > hist(Xvec3, breaks=c(0,2,4,8,20), plot=F)$counts [1] 191 487 303 19 ### For the timing of the methods, let's generate 1e6 variates: > unix.time(xtmp <- predict.smooth.spline(tspl, runif(1e6))$y) [1] 3.07 0.39 5.75 0.00 0.00 > unix.time(xtmp2 <- XsimAcc(1e6, 2.5)) ### This should be a large enough factor for such a large ### number of variates: ### by LLN, any factor > k =2.310939 should work! [1] 10.35 0.97 27.61 0.00 0.00 ### The accept-reject scheme is much less efficient, both because of the many extra random numbers being generated and because of the mathematical operations like log's.(The predict.spline.smooth evaluation is just linear algebra. NOTE that the elapsed-time value (the third of the times produced) is not a good measure of efficiency when you are using a shared-resource network. But the CPU time IS. (11) gmix <- function(N) { xout <- numeric(N) eps <- rbinom(N,1,.3) naux <- sum(eps) xout[eps==1] <- rnorm(naux) xout[eps==0] <- rlogis(N-naux)*1.5 xout } > xtmp <- gmix(1000) > summary(xtmp) Min. 1st Qu. Median Mean 3rd Qu. Max. -13.25289 -1.15870 0.00146 0.01452 1.23705 14.89885 > diff(c(0, 0.3*pnorm(c(-5,-1,.5,4))+.7*plogis(c(-5,-1,.5,4)/1.5),1)) [1] 0.02411172 0.26095540 0.33017076 0.33927420 0.04548792 expec <- 1000*.Last.value obs <- hist(xtmp, breaks=c(-15,-5,-1,.5,4,20), prob=F, plot=F)$count > sum((obs-expec)^2/expec) [1] 2.217372 ### Very moderate for chi-square 5df !! (b) First generate triple mm of counts of observations falling intorespective intervals (-Inf, -1], (-1,1], (1,Inf) (with respective probabilities .2/e = .0736, 1-.5/e = .8161, .3/e = .1104. Then simulate within these intervals from corresponding densities using exponentials for extreme intervals and rejection method using normals for middle interval. NB. 
We can actually use the rejection method for the middle interval to choose those observations falling in the tails, as follows, using the facts > c(qnorm(.2/exp(1)), qnorm(1-.3/exp(1))) [1] -1.449666 1.224595 > ThrSim <- function(N, fac=2) { rnvec <- rnorm(N) mm1 <- sum(rnvec < -1.449666) mm3 <- sum(rnvec > 1.224595) ind <- (1:N)[rnvec >= -1.449666 & rnvec <= 1.224595] rnaux <- c(rnvec[ind], rnorm(200+sum(ind)*(fac-1))) outvec <- numeric(N) outvec[rnvec < -1.449666] <- -1-rexp(mm1) outvec[rnvec > 1.224595] <- 1+rexp(mm3) outvec[ind] <- rnaux[abs(rnaux)<1][1:(N-mm1-mm3)] list(outvec = outvec, mcounts =c(mm1,N-mm1-mm3,mm3), normVal=ind) } > dout <- ThrSim(1000) > dout$mcounts/1000 [1] 0.078 0.818 0.104 ### These numbers match well > chisq.gof(dout$outvec[dout$outvec> 1]-1, dist="exponential", rate=1/mean(dout$outvec[dout$outvec> 1]-1), n.param.est=1) Chi-square Goodness of Fit Test data: dout$outvec[dout$outvec > 1] - 1 Chi-square = 6.25, df = 11, p-value = 0.8562 ### OK !!! > chisq.gof(-dout$outvec[dout$outvec < -1]-1, dist="exponential", rate=1/mean(-dout$outvec[dout$outvec < -1]-1), n.param.est=1) Chi-square Goodness of Fit Test data: - dout$outvec[dout$outvec < -1] - 1 Chi-square = 11.2308, df = 10, p-value = 0.3398 ### OK. (12) There are a few different ways to do this. The ones I will cover are three: (a) multivariate probability integral transform method (very messy!), (b) same method applied to affinely transformed data (very simple!), (c) naive accept-reject (slightly wasteful but very simple). METHOD 0. This is a really ugly method, when applied to the untransformed problem. As an indication, find the marginal density f_Y(y) = c* I[0 integrate(function(y) (1-y)* pmin(y,1.5-y),0,1)$integral + integrate(function(y) 0.5*((1-y)^2 - pmax(0,y-0.5)^2), 0,.75)$integral [1] 0.3229167 This agrees to seven decimal places with the value 31/96 calculated below, under method 1. METHOD 1. We know how to do change-of-variable based on affine transformations. Here let u=x, v=x+y, w=x+z. Then the mapping (x,y,z) -> (u,v,w) is evidently linear and invertible, with Jacobian determinant 1: so the inverse preserves volumes, and the region of interest in (u,v,w) coordinates is (v,w) in (0,1)^2, max(0.5*(v+w)-0.75, 0) < u < min(v,w) However, we must still do the necessary integrations if we want to use the (trivariate) probability integral transform method, yielding f_{V,W}(v,w) = c * (min(v,w) - max(0, 0.5*(v+w) - 0.75)), for v, w in (0,1); and then f_V(v) = c*(v - 0.5*v^2 - 0.25*(max(0,v-0.5))^2); so that F_V(v) = c*( 0.5*v^2 - v^3/6 - (1/12)*(max(0,v-0.5))^3) ## Setting v=1, F_V(1)=1, gives the value c=96/31 mentioned above. ## NOTE that direct numerical inversion of this distribution function ## is not feasible in parallel fashion, so we resort to tabulation, ## spline-interpolation of the inverse, etc. 
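A quick added cross-check of the constant before continuing: integrating the unnormalized f_V over (0,1) numerically should give 31/96 = 0.3229167, the same volume found under Method 0 above, confirming c = 96/31.

> integrate(function(v) v - 0.5*v^2 - 0.25*pmax(0, v - 0.5)^2, 0, 1)
### returns 0.3229167 = 31/96  (component $integral in S-Plus, $value in R)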
F_{W|V}(w|v) = (c/F_V(v))*(0.5*min(v,w)^2+ v*max(w-v,0) - 0.25*I[v>0.5]*(v+w-1.5)^2) > fv <- function(v) (96/31)*(v - 0.5*v^2 - 0.25*(pmax(0,v-0.5))^2) FV <- function(v) (96/31)*( 0.5*v^2 - v^3/6 - (1/12)*(pmax(0,v-0.5))^3) FWcondV <- function(w,v) (0.5*pmin(v,w)^2 + v*pmax(w-v,0) - 0.25* pmax(0,v+w-1.5)^2)/(v - 0.5*v^2 - 0.25*(pmax(0,v-0.5))^2) #### OK, now we can use the tabulate/interpolate idea for V, and explicit inversion for FWcondV: > FWinvV <- function(r,v) { fout <- numeric(length(r)) ## assumes r, v have same length vsm <- (v < 0.5) wltv <- (r < FWcondV(v,v)) fout[vsm & wltv] <- { vaux <- v[vsm & wltv] sqrt(2*r[vsm & wltv]*vaux*(1 - 0.5*vaux)) } fout[vsm & !wltv] <- { vaux <- v[vsm & !wltv] r[vsm & !wltv]*(1 - 0.5*vaux) + vaux/2 } vwsumg <- (r >= FWcondV(pmin(1.5-v,1),v)) if(sum(!vsm)) { auxb <- (!vsm & !vwsumg) cc <- 96/31 if(sum(auxb)) { fout[auxb & wltv] <- { vaux <- v[auxb & wltv] sqrt(2*r[auxb & wltv]*fv(vaux)/cc) } fout[auxb & !wltv] <- { vaux <- v[auxb & !wltv] r[auxb & !wltv]*fv(vaux)/(cc*vaux) + vaux/2 } } auxb <- (!vsm & vwsumg) if(sum(auxb)) { fout[auxb & wltv] <- { vaux <- v[auxb & wltv] sqrt(4*r[auxb & wltv]*fv(vaux)/cc+2*(vaux-1.5)^2) + vaux-1.5 } fout[auxb & !wltv] <- { vaux <- v[auxb & !wltv] vaux+1.5 - sqrt(vaux*(6-2*vaux)-4*r[auxb & !wltv]*fv(vaux)/cc)} } } fout } > vt <- runif(1000) wt <- runif(1000) rt <- FWcondV(wt,vt) summary(FWinvV(rt,vt)-wt) Min. 1st Qu. Median Mean 3rd Qu. Max. -2.220e-16 0.000e+00 0.000e+00 -1.629e-18 0.000e+00 2.220e-16 ### Up to here, we have tested that FWinvV correctly inverts FWcondV > M1 <- function(N, spacng=.01) { spts <- (0:100)/100 Vspl <- smooth.spline(FV(spts),spts, spar=1.e-6) Vsim <- predict.smooth.spline(Vspl,runif(N))$y Wsim <- FWinvV(runif(N),Vsim) Usim <- runif(N, pmax((Vsim+Wsim)/2-.75,0), pmin(Vsim,Wsim)) cbind(x=Usim, y=Vsim-Usim, z=Wsim-Usim) } METHOD 2. Now we have only to generate a series of X,Y,Z values directly in the cube (0,1)^3 and reject those that fall outside the region. > M2 <- function(N, fac=1/0.321) { naux <- 200 + N*fac xyzmat <- matrix(runif(3*naux), ncol=3, dimnames=list(NULL, c("x","y","z"))) bool <- xyzmat[,1]+xyzmat[,2] < 1 & xyzmat[,1]+xyzmat[,3] < 1 & xyzmat[,2]+xyzmat[,3] < 1.5 list(xyzmat = xyzmat[bool,][1:N,], volum=mean(bool)) } > M2(1000)[[2]] [1] 0.3309201 > M2(1.e6)[[2]] [1] 0.3226401 ### took 10 minutes on Sparc 5 !? ### OK, now generate two different matrices 1000 x 3: > mat1 <- matrix(unlist(M1(1000)), ncol=3) mat2 <- M2(1000) ### recall this is a list by def'n > mat2$volum [1] 0.3257919 ### OK, still agrees with volumes above > mat2 <- mat2$xyzmat ## For chi-square use the cells based on x,y,z < .5 or >= .5 ## NB the cells with x,y>.5 or x,z > .5 are empty (4 in all) > t1 <- table( mat1[,1] >= .5, mat1[,2] >= .5, mat1[,3] >= .5) t2 <- table( mat2[,1] >= .5, mat2[,2] >= .5, mat2[,3] >= .5) > t1 , , = FALSE FALSE TRUE FALSE 401 186 TRUE 122 0 , , = TRUE FALSE TRUE FALSE 190 101 TRUE 0 0 > t2 , , = FALSE FALSE TRUE FALSE 386 177 TRUE 126 0 , , = TRUE FALSE TRUE FALSE 199 112 TRUE 0 0 > sum(t1+t2>0) [1] 5 ### so the chi-square will have 4 df > sum(((t1-t2)^2/(t1+t2))[t1+t2>0]) [1] 1.349854 #### Convincingly small !!! Method 3. Suppose we had wanted only to test the validity of M2 without working so hard to establish the correct marginal and onditional densities. It would have sufficed to create a rejection- method simulation which worked differently from (and so might be expected to have errors, if any, different from) M2. 
For example, let's use the same transformation as in Method 1; create (V,W) by a reject-accept method using the known density f_{V,W}; and then generate U conditionally uniform in [max(0.5*(v+w)-0.75, 0), min(v,w)]. > M3 <- function(N, fac=5) { naux <- 200 + N*fac vwrmat <- matrix(runif(2*naux), ncol=2) vwrmat <- vwrmat[runif(naux) < pmin(vwrmat[,1], vwrmat[,2]) -pmax(0,0.5*(vwrmat[,1]+vwrmat[,2])-0.75),][1:N,] uvec <- runif(N, pmax(0,0.5*(vwrmat[,1]+vwrmat[,2])-0.75), pmin(vwrmat[,1], vwrmat[,2])) xyzmat = cbind(x=uvec, y=vwrmat[,1]-uvec, z=vwrmat[,2]-uvec) } > mat3 <- M3(1000) > t3 <- table(mat3[,1]>0.5, mat3[,2]>0.5, mat3[,3]>0.5) > t3 , , = FALSE FALSE TRUE FALSE 363 215 TRUE 125 0 , , = TRUE FALSE TRUE FALSE 204 93 TRUE 0 0 > sum(((t3-t2)^2/(t3+t2))[t3+t2>0]) [1] 6.216943 ### still OK for 4 df chisq > 1-pchisq(6.217,4) ### pvalue = 0.184 (13) > Trapez <- function(gfcn, a, b, m) { hh <- (b-a)/m gvec <- gfcn(seq(a,b, by=hh)) cumsum(c(0, gvec[1:m]+gvec[2:(m+1)]))*(hh/2) } SimpRul <- function(gfcn, a, b, m) { hh <- (b-a)/(2*m) gvec <- gfcn(seq(a,b, by=hh)) cumsum(c(0,gvec[seq(1,2*m-1, by=2)]+4*gvec[(1:m)*2] + gvec[seq(3,2*m+1, by=2)]))*(hh/3) } (cumsum(c(0, gvec[1:(2*m-1)]+4*gvec[2:(2*m)] + gvec[3:(2*m+1)]))*(hh/3))[seq(1,2*m+1, by=2)] > summary(pgamma(seq(0,7,by=1/10), shape=2.5) - Trapez(function(x) dgamma(x, shape=2.5), 0, 7, 70)) Min. 1st Qu. Median Mean 3rd Qu. -0.0002429525 0.0000719017 0.0000917735 0.0000611028 0.0001241061 Max. 0.0001456788 > tmp <- SimpRul(function(x) dgamma(x, shape=2.5), 0, 7, 70) summary(pgamma(seq(0,7,by=1/10), shape=2.5) - tmp) Min. 1st Qu. Median Mean 3rd Qu. -6.350734e-06 -6.343742e-06 -6.340412e-06 -6.243459e-06 -6.340257e-06 Max. 0.000000e+00 (14) > NRaph <- function(x0, b, gfcn, gprimfcn, tol = 1e-06, maxit = 15) { xout <- x0 bmat <- matrix(b, ncol = if(is.null(ncol(b))) 1 else ncol(b)) g.old <- gfcn(x0, bmat) ind <- 1:length(x0) hlist <- NULL ctr <- 0 while(ctr < maxit & length(ind)) { xout[ind] <- xout[ind] - g.old/ gprimfcn(xout[ind], bmat[ind, ]) ctr <- ctr + 1 ind <- ind[abs(g.old) > tol] g.old <- gfcn(xout[ind], bmat[ind, ]) ### hlist is a list withindices used in successive iterations hlist <- c(hlist, list(ind)) } list(xout=xout, hlist = hlist) } > g <- function(x, b) 4 * x^3 - 3 * x^2 - b gprim <- function(x,b) 12*x^2 - 6*x > tmpb <- runif(20) tmpx <- NRaph(rep(1,20), tmpb)$xout ### cannot use initial x0 = 0 or 0.5, or else gprim=0 > round(tmpx,5) [1] 0.77220 0.95311 0.79953 0.96977 0.90097 0.91101 0.88998 0.82857 0.90248 [10] 0.87705 0.90143 0.94297 0.88818 0.84548 0.95940 0.80978 0.90905 0.96719 [19] 0.77900 0.79330 > for (i in 1:20) cat(round(c(tmpb[i], optimize(g,c(-10,10), b=tmpb[i])$min),5), "\n") 0.05295 0.77221 0.73802 0.95311 0.12665 0.79952 0.82675 0.96977 0.49021 0.90097 0.53452 0.91102 0.4435 0.88998 0.21575 0.82856 0.49676 0.90248 0.39091 0.87704 0.49218 0.90143 0.68635 0.94297 0.43601 0.88818 0.273 0.84547 0.77099 0.95941 0.15679 0.80977 0.52573 0.90906 0.81266 0.96719 0.07039 0.779 0.109 0.79332 (15) Simulate using b0=3.6, > nvec <- c(100,50, 120,80) pvec <- 0.1*log(1+3.6*(1:4)) mvec <- rbinom(4,nvec,pvec) > cbind(nvec=nvec, pvec=pvec, xv=1:4, mvec=mvec, Emvec=round(nvec*pvec,2)) nvec pvec xv mvec Emvec [1,] 100 0.1526056 1 19 15.26 [2,] 50 0.2104134 2 13 10.52 [3,] 120 0.2468100 3 30 29.62 [4,] 80 0.2734368 4 24 21.87 ### (exp(10*mvec[3]/nvec[3])-1)/3 [1] 3.727498 ### will use this as starting value ### Other possible choices would have been: > (exp(10*mvec/nvec)-1)/(1:4) [1] 5.685894 6.231869 3.727498 4.771384 ### NB all are 
greater than the true value 3.6, ### because it just happens that all observed ### numbers of successes are larger than their ### corresponding expected numbers. > lLikHW15 <- function(b) { pvb <- .1*log(1+b*(1:4)) ## vector sum(mvec*log(pvb)+(nvec-mvec)*log(1-pvb)) } ## Note that this function does not make sense for ## b greater than (exp(10)-1)/4 = 5506.366 > optimize(lLikHW15, c(0,5000), max=T)[c(1,2,5)]$maximum: [1] 4.820766 $objective: [1] -193.9393 $message: [1] "normal termination" > c(lLikHW15(4.8208), lLikHW15(3.6)) [1] -193.9393 -194.6267 ### The likelihood is greater at the MLE of 4.82 ### than at the true value, but not MUCH greater. > NRaph2 <- function(x0, gfcn, gprimfcn, tol = 1e-06, maxit = 15) { xout <- x0 g.old <- gfcn(x0) ctr <- 0 while(ctr < maxit & abs(g.old) > tol) { xout <- xout - g.old/gprimfcn(xout) ctr <- ctr + 1 g.old <- gfcn(xout) } list(xout=xout, niter=ctr, logLik=lLikHW15(xout), Hess=gprimfcn(xout)) } > tmpo <- NRaph2(3.7275, function(b) { pv <- .1*log(1+b*(1:4)) sum((0.1*(1:4)/(1+b*(1:4)))*(mvec-nvec*pv)/(pv*(1-pv)))}, function(b) { pv <- .1*log(1+b*(1:4)) -0.1* sum(((1:4)/(1+b*(1:4)))^2*( (mvec-nvec*pv)/ (pv*(1-pv))+0.1*(mvec/pv^2+(nvec-mvec)/(1-pv)^2)))}) $xout: [1] 4.820771 $niter: [1] 4 $logLik: [1] -193.9393 $Hess: [1] -0.6785414 > xt <- seq(2,7,by=.05) llk <- numeric(101) for(i in 1:101) llk[i] <- lLikHW15(xt[i]) plot(xt,llk) abline(h=tmpo$logLik - 0.5*qnorm(.975)^2, lty=3) ### This line shows that all values b within (2.9,8) ### would lie within the "confidence interval" ## Another way to view this is through the Wald CI > tmpo$xout - c(-1,1)*qnorm(.975) / tmpo$Hess [1] 1.932275 7.709266 ### We can see that this "interval" is skewed wrt ### LRT test-based region because the logLik function ### is asymmetric near the max !! ### Chi-square goodness of fit: > pvopt <- .1*log(1+tmpo$xout*(1:4)) sum((mvec-nvec*pvopt)^2/(nvec*pvopt*(1-pvopt))) [1] 0.6289064 ### compare with chisq 3df (16) SIMULATION of data: ## Start with correlated normal observations. > xvec <- rnorm(1000)*0.4 dmat <- cbind(rep(1,1000), xvec, -3*xvec+rnorm(1000)*0.7) > apply(dmat,2,summary) xvec Min. 1 -1.156850840 -4.83032127 1st Qu. 1 -0.259528092 -0.97472353 Median 1 0.022094641 -0.09772142 Mean 1 0.007300602 -0.05061261 3rd Qu. 1 0.271476445 0.84269223 Max. 1 1.194744110 4.82130422 > b1vec <- c(-1.3, .5, .25) scors <- c(dmat %*% b1vec) summary(exp(scors)) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.1480496 0.2360536 0.2704901 0.2757545 0.3074749 0.5199690 > ydat <- rbinom(1000,1,exp(scors)) > outmat <- cbind(round(dmat,6), ydat) write(t(outmat),file="/home1/evs/public_html/s798c/Data/Dset3D.asc", ncolumns=4) Analysis of "outmat" imported from Data/Dset3D.asc --------------------------------------------------- ### True beta is c(-1.3, .5, .25) > dfram <- data.frame(outmat[,2:4]) names(dfram) <- c("x1","x2","y") > glmtmp <- glm(cbind(y,1-y) ~ . , family=binomial, data=dfram) > glmtmp .. Coefficients: (Intercept) x1 x2 -0.967169 0.3817471 0.307846 Degrees of Freedom: 1000 Total; 997 Residual Residual Deviance: 1161.625 > c(lLikC(b1vec), lLikC(glmtmp$coef)) [1] -593.2391 -580.8126 ### NOTE that the logLik difference is much larger than would be ### expected (one half of a chi-square 3df rv) if the model ### were properly specified !!! 
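The function lLikC called just above (and again below) is never defined in this excerpt. A minimal reconstruction consistent with how it is used, namely the Bernoulli log-likelihood of the logistic model, assuming the dmat and dfram objects created above, is:

> lLikC <- function(b) {
     scors <- c(dmat %*% b)                      ### linear predictor X %*% b
     sum(dfram$y*scors - log(1 + exp(scors)))    ### Bernoulli loglik with P(y=1) = plogis(scors)
  }
### Sanity check: -2*lLikC(glmtmp$coef) = 1161.6 matches the residual deviance reported above.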
> tmpmax <- nlmin(function(b) -lLikC(b), rep(0,3)) > tmpmax $x: [1] -0.9671691 0.3817473 0.3078461 $converged: [1] T $conv.type: [1] "relative function convergence" ============================================================== ### Goodness of fit CI's > rbind(b1vec, fitted=glmtmp$coef, s.dev=summary(glmtmp)$coef[,2]) (Intercept) x1 x2 b1vec -1.30000000 0.5000000 0.25000000 fitted -0.96716901 0.3817471 0.3078460 s.dev 0.07163044 0.3490234 0.1012989 > rbind(ulim = glmtmp$coef + 1.96*summary(glmtmp)$coef[,2], llim = glmtmp$coef - 1.96*summary(glmtmp)$coef[,2]) (Intercept) x1 x2 ulim -0.8267733 1.0658331 0.5063918 llim -1.1075647 -0.3023388 0.1093002 These give CI's based on "properly specified" Iobs-based analysis. The comparison below with Jobs shows that this is reasonable. Other, test-based, CI's: for this we need to re-calculate log-likelihoods with maximized values substituted for all coordinates other than the one of interest. ### Begin with intercept: > likAlt1 <- function(b0) { auxmin <- nlminb(glmtmp$coef[2:3], function(b2dim,B0) -lLikC(c(B0,b2dim)), B0=b0) c(rMLE = auxmin$param, lLik= -auxmin$obj) } > likAlt1(glmtmp$coef[1]) rMLE.x1 rMLE.x2 lLik 0.3817471 0.307846 -580.8126 > likAlt1(-.7) rMLE.x1 rMLE.x2 lLik 0.375994 0.2926115 -588.0464 > likAlt1(-1.2) rMLE.x1 rMLE.x2 lLik 0.3964614 0.3298038 -585.8997 > xt <- seq(-1.3,-.7, length=100) yt <- numeric(100) for(i in 1:100) yt[i] <- likAlt1(xt[i])[3] plot(xt, yt) abline(h=-580.813-1.92) ## -582.733 threshold ### graphically read off CI for intercept: (-1.11, -0.83) > uniroot(function(b) likAlt1(b)[3]+582.733, c(-1.2,glmtmp$coef[1]))$root [1] -1.10913 > uniroot(function(b) likAlt1(b)[3]-(-580.813-1.92), c(glmtmp$coef[1],-0.7))$root [1] -0.8282462 ### This reproduces the Wald interval above for Intercept *** Now for x1 interval by test-based method: > likAlt2 <- function(b1) { auxmin <- nlminb(glmtmp$coef[c(1,3)], function(b2dim,B1) -lLikC(c(b2dim[1],B1,b2dim[2])), B1=b1) c(rMLE = auxmin$param, lLik= -auxmin$obj) } > c(uniroot(function(b) likAlt2(b)[3]+582.733, c(-.6,glmtmp$coef[2]))$root, uniroot(function(b) likAlt2(b)[3]+582.733, c(glmtmp$coef[2],1.5))$root) [1] -0.3019839 1.0673748 ### OK, very close !! *** Now for x2 interval by test-based method: > likAlt3 <- function(b2) { auxmin <- nlminb(glmtmp$coef[1:2], function(b2dim,B2) -lLikC(c(b2dim[1:2],B2)), B2=b2) c(rMLE = auxmin$param, lLik= -auxmin$obj) } > c(uniroot(function(b) likAlt3(b)[3]+582.733, c(-.2,glmtmp$coef[3]))$root, uniroot(function(b) likAlt3(b)[3]+582.733, c(glmtmp$coef[3],.8))$root) [1] 0.1102483 0.5076699 ### OK, close again !! (I) Comparison of 2 ways to calculate Iobs ### The first is the one inverted above: > Iobs1 <- t(dmat) %*% (dlogis(c(dmat %*% glmtmp$coef))*dmat) > round(Iobs1,4) xvec 196.5368 -5.8305 24.7926 xvec -5.8305 31.7733 -94.4104 24.7926 -94.4104 378.2660 > Iobs2 <- t(dmat) %*% ((dfram[,3] - plogis(c(dmat %*% glmtmp$coef)))^2*dmat) round(Iobs2,4) xvec 196.4871 -6.4059 25.1193 xvec -6.4059 31.2392 -93.8838 25.1193 -93.8838 382.9509 (II) Goodness of fit statistics: > sum( (dfram[,3] - plogis(c(dmat %*% glmtmp$coef)))^2 )/ sum( dlogis(c(dmat %*% glmtmp$coef)) ) [1] 0.9997468 ### Ratio of variances SHOULD be 1 ! > sum( (dfram[,3] - plogis(c(dmat %*% glmtmp$coef)))^2 / dlogis(c(dmat %*% glmtmp$coef)) ) [1] 1000.947 ### Fine for chi-squared 300 df! 
> mtabl <- table(dfram[,1]>0, dfram[,2] > 0, dfram[,3]) mtabl ### These are the observed counts , , 0 FALSE TRUE FALSE 61 267 TRUE 349 46 , , 1 FALSE TRUE FALSE 18 135 TRUE 100 24 > etabl <- aggregate.data.frame(plogis(c(dmat %*% glmtmp$coef)), by= list(dfram[,1]>0, dfram[,2] > 0), sum) > etabl Group.1 Group.2 x 1 FALSE FALSE 19.26567 2 TRUE FALSE 103.74966 3 FALSE TRUE 131.91261 4 TRUE TRUE 22.07206 > sum((etabl[,3]-mtabl[,,2])^2/etabl[,3]) [1] 0.4593281 ### Very small chi-square value !! (17) Log to Simulate and Begin Analysis of Thoroughly Misspecified Linear Model Dataset Dset17.asc for HW17. ===================================================== The model: y_i ~ N(a + Xmat %*% b), c^2) is an ordinary linear regression model with scalar intercept a, error-variance c^2, and 3-dim regression coefficients b, altogether giving 5-dim unknown parameter. > Xmat <- cbind(rnorm(350)*0.4+0.5, rnorm(350)*(-1.2), rexp(350)*0.6) > dimnames(Xmat)[[2]] <- c("X1","X2","X3") > Yvec <- -1.6 + Xmat[,1]^2 - 0.5*Xmat[,2] + sqrt(Xmat[,3]) + rnorm(350)*.4 > Lfram <- data.frame(cbind(Xmat[,c(1,3)], Y=Yvec)) > write.table(Lfram,"/home1/evs/public_html/s798c/Data/Dset17.asc", sep=" ") ## NB transformed two variables and omitted the third ! > lm1 <- lm(formula= Y ~ . , data=Lfram) Call: lm(formula = Y ~ ., data = Lfram) Coefficients: (Intercept) X1 X3 -1.387107 1.06529 0.5260654 *** So for part (i), get this for the b0,b1,b2 coeff's and *** the estimated variance parameter is > summary(lm1)$sigma^2 [1] 0.581183 *** For part (ii), we re-do the estimation using nlminb: > lLikNR <- function(betsigsq) -(nrow(Lfram)/2)*log(betsigsq[4]) - (0.5/betsigsq[4])* sum((-betsigsq[1] + c(data.matrix(Lfram) %*% c(betsigsq[2:3],1)))^2) > nlminb(c(rep(0,3),1), function(x4) -lLikNR(x4), lower=c( rep(-Inf,3),1e-5), upper=rep(Inf,4))$param [1] -1.3871072 -1.0652906 -0.5260648 0.5762019 ## OK !! ## Recover cov.unscaled from summary(lm1)$sigma^2 and ## summary(lm1)$cov.unscaled > I0 <- solve(summary(lm1)$cov.unscaled) > round(I0,4) (Intercept) X1 X3 (Intercept) 350.0000 178.7370 231.0422 X1 178.7370 151.4908 124.0789 X3 231.0422 124.0789 293.3373 > Xm <- cbind(rep(1,350), data.matrix(Lfram[,1:2])) t(Xm) %*% Xm X1 X3 350.0000 178.7370 231.0422 X1 178.7370 151.4908 124.0789 X3 231.0422 124.0789 293.3373 > round(summary(lm1)$coef,3) Value Std. Error t value Pr(>|t|) (Intercept) -1.387 0.076 -18.353 0 X1 1.065 0.098 10.820 0 X3 0.526 0.064 8.171 0 > sqrt(diag(summary(lm1)$cov.unscaled))*summary(lm1)$sigma [1] 0.07558005 0.09846005 0.06438342 *** So for part (iii), we use these estimates and SE's *** to get Wald-type CI's: > rbind(lower = lm1$coef-1.96*summary(lm1)$coef[,2], upper = lm1$coef+1.96*summary(lm1)$coef[,2]) (Intercept) X1 X3 lower -1.535244 0.8723081 0.3998739 upper -1.238971 1.2582715 0.6522569 ### Now proceed to compare Iobs=Iobs1 vss Jobs=Iobs2 > Iobs1 <- t(Xm) %*% Xm /summary(lm1)$sigma^2 ### Full Iobs1 matrix also has sigma^2 row and column: > Iobs1 <- cbind(rbind(Iobs1,rep(0,3)), c(0,0,0, 175/summary(lm1)$sigma^4)) Iobs1 X1 X3 602.2199 307.5399 397.5377 0.0000 X1 307.5399 260.6594 213.4936 0.0000 X3 397.5377 213.4936 504.7244 0.0000 0.0000 0.0000 0.0000 518.0983 > tm <- c(Lfram$Y - Xm %*% lm1$coef) Iobs2 <- t((tm^2*Xm)) %*% Xm / summary(lm1)$sigma^4 > Iobs2 X1 X3 597.0580 274.7184 344.1382 X1 274.7184 278.5695 184.6571 X3 344.1382 184.6571 407.6772 ### This is like our Jobs in class, but so far just the ### upper-left 3x3 block. 
> baux <- c(t(tm*Xm) %*% (tm^2/summary(lm1)$sigma^2-1)/ (2*summary(lm1)$sigma^4)) > Iobs2 <- cbind(rbind(Iobs2,baux), c(baux, sum((tm^2/ summary(lm1)$sigma^2-1)^2/(4*summary(lm1)$sigma^4)))) > Iobs2 X1 X3 597.0580 274.71839 344.13823 125.75393 X1 274.7184 278.56954 184.65713 31.72295 X3 344.1382 184.65713 407.67716 32.08578 baux 125.7539 31.72295 32.08578 524.76716 ## Recall the raw standard errors of coefficients: > round(sqrt(diag(summary(lm1)$cov.unscaled))* summary(lm1)$sigma,5) [1] 0.07558 0.09846 0.06438 > sqrt(diag(solve(Iobs1))) [1] 0.07558005 0.09846005 0.06438342 0.04393331 ### Now we have the "robust sandwich" version: > RS.SE <- round(sqrt(diag(solve(Iobs1) %*% Iobs2 %*% solve(Iobs1))),5) [1] 0.08905 0.11936 0.05996 0.04422 ### So the "robust" calculation makes a noticeable difference, although not a large one, even in the asymptotic standard error for sigma^2. *** Part (iv) Robust CI's for each of Intercept, X1, X2: > rbind(lower=lm1$coef-1.96*RS.SE[1:3], upper=lm1$coef+1.96*RS.SE[1:3]) (Intercept) X1 X3 lower -1.561645 0.8313442 0.4085438 upper -1.212569 1.2992354 0.6435870 (18) (i) Example as in class. begin by maximizing logLik directly. > Tvec <- scan(file="Dset3B.dat") ## after saving data into file > lLikmix <- function(plam) sum(log((1-plam[1])*plam[2]*exp(-(plam[2]-2)*Tvec)+2*plam[1])) > nlminb(c(.5,2), function(pl) -lLikmix(pl), lower=c(1e-6,1e-5), upper=c(1-1e-6,1e2))[1:4] $parameters: [1] 0.3998813 0.3663359 $objective: [1] -2133.742 $message: [1] "RELATIVE FUNCTION CONVERGENCE" $grad.norm: [1] 0.0129577 ### took 18 iterations ... > plam <- .Last.value[[1]] ## Let's check this, roughly. The mixture density should ## have mean p/2 + (1-p)/lambda > c(plam[1]/2 + (1-plam[1])/plam[2], mean(Tvec)) [1] 1.838106 1.839836 #### OK !! (ii) EM algorithm: (poor starting point .5, 2 as above) > plamEM <- function(plam0) { probs <- 1/(1+(1/plam0[1]-1)*0.5*plam0[2]*exp( -(plam0[2]-2)*Tvec)) mp <- mean(probs) c(mp,(1-mp)/mean(Tvec*(1-probs))) } Next step: recursively calculate, and print results: > plam0 <- c(.5,2) for(i in 1:100) { plam0 <- plamEM(plam0) paux <- round(plam0,5) cat("iter",i,",p=", paux[1],",lam=",paux[2]," ") if(i %% 2 == 0) cat("\n") } ... iter 41 ,p= 0.39988 ,lam= 0.36634 iter 42 ,p= 0.39988 ,lam= 0.36634 ### Reach this value at iteration 41 and didn't change ... > lLikmix(c(.5,2)) [1] 693.1472 > lLikmix(c(0.39988, 0.36634)) [1] 2133.742 ### Let's confirm that the logLik never decreased: > plamLik <- numeric(101) plam0 <- c(.5,2) plamLik[1] <- lLikmix(plam0) for(i in 2:101) { plam0 <- plamEM(plam0) plamLik[i] <- lLikmix(plam0) } sum(diff(plamLik) < 0) > min(diff(plamLik)) [1] -5.911716e-12 (19) Of course we could simulate a multinomial(100,(a,(1+2a)/6, (1-a)/3, 1/2-a)) for a=.1, by: > table(sample(1:4,100,replace=T, prob=c(.1,.2,.3,.4))) 1 2 3 4 5 17 27 51 ## But that is not at all parallelizable, so to do it 10000 ## times would either involve doing a 10000x100 array or ## using a for-loop ! Instead, we simulate n1 ~ Binom(100,.1), n2 ~ Binom(100-n1,.2/.9), n3 ~ Binom(100-n1-n2, .3/.7), and n4=100-n1-n2-n3. nArr <- array(0, dim=c(1e4,4)) nArr[,1] <- rbinom(1e4,100,.1) nArr[,2] <- rbinom(1e4,100-nArr[,1],2/9) nArr[,3] <- rbinom(1e4,100-nArr[,1]-nArr[,2],3/7) nArr[,4] <- 100 - nArr[,1]-nArr[,2]-nArr[,3] T1vec <- (nArr[,1]-rep(10,1e4))^2/10 + (nArr[,2]-rep(20,1e4))^2/20 + (nArr[,3]-rep(30,1e4))^2/30 + (nArr[,4]-rep(40,1e4))^2/40 > summary(T1vec) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00000 1.20833 2.38333 2.99121 4.12500 19.45833 > summary(pchisq(T1vec,3)) Min. 
1st Qu. Median Mean 3rd Qu. Max. 0.0000000 0.2489937 0.5032553 0.4991340 0.7517211 0.9997802 ### If chi-square with 3 df were correct distribution, these ### values would be: 0, .25, .5, .5, .75, 1 > rowlik <- function(nmat, avec) (nmat * log(cbind(avec,(1+2*avec)/6, (1-avec)/3, 0.5-avec))) %*% rep(1,4) rowlikpr <- function(nmat,avec) (nmat/cbind(avec,(1+2*avec)/6, (1-avec)/3, 0.5-avec)) %*% c(1,1/3,-1/3,-1) rowlikdpr <- function(nmat,avec) (nmat/cbind(avec,(1+2*avec)/6, (1-avec)/3, 0.5-avec)^2) %*% c(-1,-1/9,-1/9,-1) > aold <- rep(0,1e4) anew <- rep(.1,1e4) ctr <- 0 iset <- 1:10000 while( ctr < 20 & length(iset)) { aold[iset] <- anew[iset] anew[iset] <- anew[iset] - rowlikpr(nArr[iset,],aold[iset])/ rowlikdpr(nArr[iset,],aold[iset]) iset <- iset[abs(anew[iset]-aold[iset]) > 1.e-5] ctr <- ctr+1 } summary(rowlikpr(nArr,anew)) Min. 1st Qu. Median Mean 3rd Qu. Max. -0.01989776 0.0000 0.0000 -0.00000189 0.00000003 0.00000428 > sum(abs(rowlikpr(nArr,anew)) > 1.e-5) [1] 1 ### Actually the one remaining case, #1230 was FAR from converged !! > anew[abs(rowlikpr(nArr,anew)) > 1.e-5] [1] -5025.066 > nArr[1230,] [1] 4 6 41 49 > optimize(function(a) -rowlik(nArr[1230,],a), c(1e-6,.49))$min [1] 0.02962868 > anew[1230] <- 0.02962868 ### This is a fairly extreme value !! ### There were also 102 cases where a negative anew value was chosen! > for(i in (1:10000)[anew < 0]) anew[i] <- optimize(function(a) -rowlik(nArr[i,],a), c(1e-6,.49))$min > summary(anew) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0132118 0.0814582 0.1000000 0.1003715 0.1183398 0.2111745 > T2vec <- (nArr[,1]-100*anew)^2/(100*anew) + (nArr[,2]-100*(1/6+anew/3))^2/(100*(1/6+anew/3)) + (nArr[,3]-(100/3)*(1-anew))^2/((100/3)*(1-anew)) + (nArr[,4]-100*(0.5-anew))^2/(100*(0.5-anew)) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000000 0.2612118 0.5057364 0.4993999 0.7444405 0.9998610 ### Again this is close to the desired values for Uniform[0,1] ### We finish off by testing the difference between empirical ### distribution function values for T1vec and T2vec and ### pchisq(.,3) and pchisq(.,3), at the points 1:10. > T1df <- numeric(10) -> T2df > for(i in 1:10) { T1df[i] <- sum(T1vec <= i)/1e4 T2df[i] <- sum(T2vec <= i)/1e4 } > cbind(T1val=T1df, T1CIwidth=round(1.96*sqrt(T1df*(1-T1df))/100,5), T2val=T2df, T2CIwidth=round(1.96*sqrt(T2df*(1-T2df))/100,5), Chi2 = round(pchisq(1:10,2),4), Chi3 = round(pchisq(1:10,3),4)) T1val T1CIwidth T2val T2CIwidth Chi2 Chi3 [1,] 0.1886 0.00767 0.3835 0.00953 0.3935 0.1987 [2,] 0.4262 0.00969 0.6345 0.00944 0.6321 0.4276 [3,] 0.6100 0.00956 0.7765 0.00817 0.7769 0.6084 [4,] 0.7358 0.00864 0.8650 0.00670 0.8647 0.7385 [5,] 0.8273 0.00741 0.9219 0.00526 0.9179 0.8282 [6,] 0.8897 0.00614 0.9542 0.00410 0.9502 0.8884 [7,] 0.9273 0.00509 0.9739 0.00312 0.9698 0.9281 [8,] 0.9546 0.00408 0.9835 0.00250 0.9817 0.9540 [9,] 0.9710 0.00329 0.9894 0.00201 0.9889 0.9707 [10,] 0.9814 0.00265 0.9932 0.00161 0.9933 0.9814 ## Very interesting results! Shows clearly that T1 is at least very close to Chi-square 3df distribution, while T2 is at least very close to Chi-square 2df distribution. There MAY be discrepancy in each case at distribution argument 1 (but nowehere else), and even there the most likely explanation may be some fault of the likelihood maximizations which resulted in the smallest values of the paramete a . 
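Side note (added, not in the original log): in R the multinomial counts can also be generated directly and fully vectorized with rmultinom, which avoids the sequential-binomial construction; that construction is still instructive and was the natural route in older S-Plus.

> nArrAlt <- t(rmultinom(1e4, size = 100, prob = c(.1, .2, .3, .4)))   ### 1e4 x 4 matrix of counts
> apply(nArrAlt, 2, mean)    ### column means should be close to 10, 20, 30, 40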
(20) /* Saved SASsales.asc data as HW20dat, ftp'd to cluster */ libname home "."; data home.HW20; infile"HW20dat"; input Mth Qtr Rep $ Type $ Units Price; Amount = Units*Price; run; NOTE: The data set HOME.HW20 has 110 observations and 7 variables. proc sort data=home.HW20 ; /* could omit the data = option */ by QTR REP; proc means data=home.HW20 N Mean; var amount ; by QTR Rep; run; ------ Qtr=1 Rep=Hollings ----- The MEANS Procedure Analysis Variable : Amount N Mean ----------------- 5 7558.90 ------ Qtr=1 Rep=Jones -------- N Mean ----------------- 5 6451.83 ------ Qtr=1 Rep=Smith -------- N Mean ----------------- 8 9286.86 ------ Qtr=2 Rep=Hollings ----- N Mean ----------------- 6 10314.04 ------ Qtr=2 Rep=Jones ------- N Mean ----------------- 6 8023.23 ------ Qtr=2 Rep=Smith ------- N Mean ----------------- 6 11687.38 ------ Qtr=3 Rep=Hollings ---- N Mean ----------------- 15 4642.82 ------- Qtr=3 Rep=Jones ------ N Mean ----------------- 21 6725.55 ------ Qtr=3 Rep=Smith ------- N Mean ----------------- 21 6886.85 ----- Qtr=4 Rep=Hollings ----- N Mean ----------------- 6 8667.08 ----- Qtr=4 Rep=Jones -------- N Mean ----------------- 6 12245.98 ------ Qtr=4 Rep=Smith ------- N Mean ----------------- 5 7712.67 /* Or alternatively: */ proc tabulate data=home.hw20; class Qtr Rep ; var amount; table Qtr*Rep, amount*mean amount*n; run; The SAS System 11:09 Friday, April 2, 2004 1 ----------------------------------------------------- | | Amount | Amount | | |------------+------------| | | Mean | N | |-------------------------+------------+------------| |Qtr |Rep | | | |------------+------------| | | |1 |Hollings | 7558.90| 5.00| | |------------+------------+------------| | |Jones | 6451.83| 5.00| | |------------+------------+------------| | |Smith | 9286.86| 8.00| |------------+------------+------------+------------| |2 |Hollings | 10314.04| 6.00| | |------------+------------+------------| | |Jones | 8023.23| 6.00| | |------------+------------+------------| | |Smith | 11687.38| 6.00| |------------+------------+------------+------------| |3 |Hollings | 4642.82| 15.00| | |------------+------------+------------| | |Jones | 6725.55| 21.00| | |------------+------------+------------| | |Smith | 6886.85| 21.00| |------------+------------+------------+------------| |4 |Hollings | 8667.08| 6.00| | |------------+------------+------------| | |Jones | 12245.98| 6.00| | |------------+------------+------------| | |Smith | 7712.67| 5.00| ----------------------------------------------------- proc sort data=home.hw20 out=tmp20; by Type Mth Rep; data tmpA (drop=ctr) tmpB (drop=ctr) ; set tmp20; if _N_ = 1 then ctr = 0; if Type EQ "Standard" then do; ctr+1; if ctr LT 16 then output tmpA; end; else output tmpB; proc print data=tmpA; title 'Standard Units Sold, by Mth & Salesrep: 15 cases'; proc print data=tmpB; title 'Deluxe Units Sold, by Mth & Salesrep'; run; ... /* OK, worked ! */ (21) libname home "."; data home.PBC (drop=labels1-labels10); infile "pbcdata.txt"; if _N_=1 then input labels1-labels10 $ ; else input obs idnum dth evttime treatgp logbili agevar cirrh cchol albumin; if obs ne . ; run; .. "Invalid data" errors for the fileds in the blank line!! NOTE: The data set HOME.PBC has 216 observations and 10 variables. 
proc print; where obs < 6; run; Obs obs idnum dth evttime treatgp logbili agevar cirrh cchol albumin 1 1 180 0 0 1 1.00000 36.598 0 0 37 2 2 188 0 0 0 1.36173 29.964 0 0 35 3 3 56 0 0 1 0.77815 81.451 0 0 36 4 4 19 0 0 0 1.92428 121.510 1 0 41 5 5 190 0 0 1 1.00000 60.340 0 0 32 (ii) proc sort data=home.pbc out=home.altpbc; by descending treatgp; proc means data=home.altpbc mean N ; var agevar; by descending treatgp; run; /* this works, but a better output format is given by proc tabulate: */ proc tabulate data=home.altpbc; class treatgp; var agevar; table treatgp, agevar*mean, agevar*n ; run; proc sort data=home.pbc out=home.altpbc; by descending treatgp; proc means data=home.altpbc mean N ; var agevar; by treatgp; run; ----------------------------------------------------- | | agevar | agevar | | |------------+------------| | | Mean | N | |-------------------------+------------+------------| |treatgp | | | |-------------------------| | | |0 | 55.20| 112.00| |-------------------------+------------+------------| |1 | 51.07| 104.00| ----------------------------------------------------- data tmpavg (drop=agevar); set home.altpbc (keep=agevar treatgp); by descending treatgp; if first.treatgp then do; ctr=0; agemean=0; end; ctr+1; agemean+agevar; if last.treatgp then do; agemean=agemean/ctr; output; end; proc print; run; data tmpavg; set home.altpbc (keep=agevar treatgp); if first.treatgp then do; ctr=0; agemean=0; end; ctr+1; agemean+agevar; if last.treatgp then do; agemean=agemean/ctr; output; end; proc print; run; Obs treatgp ctr agemean 1 1 104 51.0668 2 0 112 55.1976O (iii) proc univariate data=home.pbc; var logbili albumin; proc means data=home.pbc; var logbili albumin; run; /* By default, "means" gives only: N mean stddev min max while "univariate" gives quantiles, extremes , t, kurtosis, you-nasme-it. But both have options (to give more or less output.) */ (iv) proc sort data=home.pbc out=alt2pbc; by dth treatgp cirrh cchol; proc means data=alt2pbc min max; var logbili; by dth treatgp cirrh cchol; run; /* Lots of output in 16 separate segments! Here is the alternative method. */ data tmpminmax (drop= logbili); set alt2pbc (keep= dth treatgp cirrh cchol logbili); by dth treatgp cirrh cchol; retain logbili; if first.cchol then do; minlbil=1000; maxlbil=-1000; end; minlbil=min(minlbil, logbili); maxlbil=max(maxlbil, logbili); if last.cchol then output; run; proc print; run; Obs dth treatgp cirrh cchol minlbil maxlbil 1 0 0 0 0 0.84510 2.07188 2 0 0 0 1 0.84510 2.17319 3 0 0 1 0 1.11394 1.92428 4 0 1 0 0 0.69897 1.96379 5 0 1 0 1 1.74036 2.26951 6 0 1 1 0 1.20412 1.98677 7 1 0 0 0 1.00000 2.46538 8 1 0 0 1 1.47712 2.72263 9 1 0 1 0 1.20412 2.50786 10 1 0 1 1 1.44716 2.46835 11 1 1 0 0 0.77815 2.34635 12 1 1 0 1 1.25527 2.56110 13 1 1 1 0 1.32222 2.03342 14 1 1 1 1 1.71600 2.62634 /* NOTE that not all cases occur !!! */ (22) Splus code to reproduce calculations in BassSAS.txt. ## Getting the data > Bass <- read.table(file="Bass.Dat", skip=35, header=T) > names(Bass) [1] "ID" "Alkalinity" "pH" [4] "Calcium" "Chlorophyll" "AvgMercury" [7] "No.samples" "min" "max" [10] "X3yrStandardMercury" "agedata" ## interesting that the new version of Splus automatically takes the ## name of the lake as the row label > names(Bass) <- c("Id","Alk","pH","Calc","Chlor", "AvMrc","nsamp","min","max","Mrc3yr","agedat") > Bass <- cbind.data.frame(logAvM = log(Bass$AvMrc), Bass) ## Plotting would be accomplished by : > plot(Bass$Alk, Bass$AvMrc) ## etc. 
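(Added aside: a scatterplot matrix gives all of these exploratory plots in one call.)

> pairs(Bass[, c("logAvM","Alk","pH","Calc","Chlor")])   ### response vs. all candidate predictors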
> lm1 <- lm(logAvM ~ Alk, data=Bass) NOTE THAT SAS IN CREATING AN OUTPUT FILE FROM PROC REG ALSO INCLUDES ALL OF THE ORIGINAL DATASET'S COLUMNS UNLESS YOU SUPPRESS THEM !! ## predictors: > lm1$fitted[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry -0.4137453 -0.3760587 -2.142617 -0.9397872 -0.3603559 -0.6288729 -0.4027533 ### all OK; and so are the residuals > lm1$resid[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry 0.6207594 0.6612376 -1.076258 0.1188066 0.5426775 -0.6804605 -0.3312158 > Xm <- model.matrix(lm1) > summary(lm1)$cov.un (Intercept) Alk (Intercept) 0.0374267821 -4.945048e-04 Alk -0.0004945048 1.317619e-05 > solve(t(Xm) %*% Xm) (Intercept) Alk (Intercept) 0.0374267821 -4.945048e-04 Alk -0.0004945048 1.317619e-05 > Hat <- Xm %*% summary(lm1)$cov.un %*% t(Xm) ## 53x53 matrix ## Now studentize residual by using standard errors sigma*sqrt(1-h_{ii}) > StudRsd <- (lm1$resid/sqrt(1-diag(Hat)))/summary(lm1)$sigma StudRsd[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry 1.064066 1.134668 -1.913229 0.2022827 0.931661 -1.161048 -0.5679219 ## Now Cook's Distance: > CookD <- (StudRsd)^2*(0.5*diag(Hat)/(1-diag(Hat))) CookD[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry 0.01874503 0.02274477 0.2033591 0.0003944242 0.01575785 0.01594074 0.00544142 ## Finally the confidence limits for mean (CLM) and individual obs (CL) > SE1 <- summary(lm1)$sigma*sqrt(diag(Hat)) SE2 <- summary(lm1)$sigma*sqrt(1-diag(Hat)) rbind(LCLM = lm1$fit[1:6] - qt(.975,51)*SE1[1:6], UCLM = lm1$fit[1:6] + qt(.975,51)*SE1[1:6], LCL = lm1$fit[1:6] - qt(.975,51)*SE2[1:6], UCL = lm1$fit[1:6] + qt(.975,51)*SE2[1:6]) Alligator Annie Apopka BlueCypress Brick Bryant LCLM -0.6268623 -0.5959707 -2.519064 -1.1035042 -0.5831806 -0.8098173 UCLM -0.2006282 -0.1561466 -1.766171 -0.7760702 -0.1375313 -0.4479284 LCL -1.5849385 -1.5459950 -3.271954 -2.1189009 -1.5297410 -1.8054660 UCL 0.7574479 0.7938777 -1.013281 0.2393265 0.8090292 0.5477203 > anova(lm1) ### Same info as in main SAS table Analysis of Variance Table Df Sum of Sq Mean Sq F Value Pr(F) Alk 1 18.71377 18.71377 53.22362 1.859181e-09 Residuals 51 17.93193 0.35161 > summary(lm1)$coef Value Std. Error t value Pr(>|t|) (Intercept) -0.32109907 0.114714868 -2.799106 7.216728e-03 Alk -0.01570274 0.002152402 -7.295452 1.859181e-09 Parameter Standard Variable DF Estimate Error t Value Pr > |t| Intercept 1 -0.32110 0.11471 -2.80 0.0072 Alk 1 -0.01570 0.00215 -7.30 <.0001 ### Next a plot (omitted) includng fitted line, actual points and CL's and CLM's. 
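Added cross-check, not in the original log: current R provides these regression diagnostics as built-ins, which is a convenient way to verify the hand-computed Hat, StudRsd and CookD above. The differences below should all be zero up to rounding.

> range(hatvalues(lm1) - diag(Hat))        ### leverages h_ii
> range(rstandard(lm1) - StudRsd)          ### internally studentized residuals
> range(cooks.distance(lm1) - CookD)       ### Cook's distance (p = 2 coefficients here)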
## Next comes the summary statistics for CookD, logMrsd, Studnt : > round(c(quantile(CookD,prob=c(0,.05,.25,.5,.75,.95,1)), sqrt(var(CookD))),4) 0% 5% 25% 50% 75% 95% 100% 0 1e-04 0.0018 0.005 0.0178 0.1185 0.2637 0.0491 > round(c(quantile(lm1$resid,prob=c(0,.05,.25,.5,.75,.95,1)), sqrt(var(CookD))),4) 0% 5% 25% 50% 75% 95% 100% -2.0655 -0.8741 -0.2795 0.0822 0.2923 0.7118 1.792 0.0491 > round(c(quantile(StudRsd,prob=c(0,.05,.25,.5,.75,.95,1)), sqrt(var(StudRsd))),4) 0% 5% 25% 50% 75% 95% 100% -3.5224 -1.4911 -0.4774 0.1409 0.5029 1.2174 3.1037 1.011 > inds <- union((1:53)[CookD > 0.140 | abs(lm1$resid) > 1 | abs(StudRsd) > 1.5], c(36,52)) > inds [1] 3 18 36 38 40 52 > cbind(data.matrix(Bass[inds,1:3]), logMrsd=lm1$resid[inds], logMprd= lm1$fit[inds], CookD=CookD[inds], Studnt = StudRsd[inds]) logAvM Id Alk logMrsd logMprd CookD Studnt Apopka -3.21887582 3 116.0 -1.0762584 -2.1426174 0.20335906 -1.913229 Harney -0.26136476 18 61.3 1.0223125 -1.2836773 0.04124808 1.747211 Orange -1.71479843 36 25.4 -0.9948497 -0.7199488 0.03054176 -1.695488 Parker -3.21887582 38 53.0 -2.0655313 -1.1533445 0.13968778 -3.522399 Puzzle 0.09531018 40 87.6 1.7919696 -1.6966595 0.26365592 3.103668 Wildcat -1.38629436 52 17.3 -0.7935378 -0.5927565 0.02281806 -1.354791 > summary(lm1)$r.sq ### = 0.5106676 lm2 <- update(lm1, data=Bass[setdiff(1:53,inds),]) > summary(lm2)$r.sq [1] 0.7197122 > rbind(lm1$coef,lm2$coef) (Intercept) Alk [1,] -0.3210991 -0.01570274 [2,] -0.2656866 -0.01600271 ## Now produce Low and Hi predictors at "outlying" points, from model defined on 47 non-`outlying' points: > c(model.matrix(lm1)[inds,] %*% lm2$coef) + outer(sqrt(1 + diag(Xm[inds,] %*% summary(lm2)$cov.un %*% t(Xm[inds,]))), c(-1,1))* summary(lm2)$sigma*qt(.975,45) [,1] [,2] [1,] -2.934843 -1.3091579 [2,] -2.026140 -0.4671655 [3,] -1.448031 0.1037205 [4,] -1.891160 -0.3365000 [5,] -2.458992 -0.8760555 [6,] -1.319657 0.2345897 #### Not exactly the values ### previously produced, but close, and reasonable ... > cbind(data.matrix(Bass[inds,1:3]), fit1=lm1$fit[inds], + fit2=c(model.matrix(lm1)[inds,] %*% lm2$coef)) logAvM Id Alk fit1 fit2 Apopka -3.21887582 3 116.0 -2.1426174 -2.1220007 Harney -0.26136476 18 61.3 -1.2836773 -1.2466526 Orange -1.71479843 36 25.4 -0.7199488 -0.6721554 Parker -3.21887582 38 53.0 -1.1533445 -1.1138301 Puzzle 0.09531018 40 87.6 -1.6966595 -1.6675238 Wildcat -1.38629436 52 17.3 -0.5927565 -0.5425335 Further illustration related to Stepwise model-selection ! 
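The SAS forward-selection run follows. As an aside not in the original log, the analogous forward search could be done on the S-Plus/R side with stepAIC from the MASS library; it uses an AIC criterion rather than SAS's C(p) and F-to-enter rules, so the selected sequence need not agree step for step.

> library(MASS)
> stepAIC(lm(logAvM ~ 1, data = Bass), scope = ~ Alk + pH + Calc + Chlor,
          direction = "forward")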
proc reg data=bass2;
   model logAvM = Alk pH Calc Chlor / selection = forward best=2;
run;

                      Summary of Forward Selection

        Variable   Number   Partial    Model
 Step   Entered    Vars In  R-Square  R-Square      C(p)    F-Val     Pr>F
   1    Alk           1      0.5107    0.5107    29.7014    53.22   <.0001
   2    Chlor         2      0.1734    0.6841     3.8085    27.45   <.0001
   3    Calc          3      0.0154    0.6995     3.3266     2.52   0.1191

(25)

> hist(geyser$waiting, nclass=20, prob=T, xlab="Waiting Times",
       ylab="Scaled Relative Frequency", main=
       "Geyser Waiting Times, Histogram & Overplotted Density Estimates")
> gpts <- seq(min(geyser$waiting), max(geyser$waiting), length=50)
  kerMatg1 <- outer(gpts, geyser$waiting, function(x,y) dlogis(x,y,.5))
  kerMatg2 <- outer(gpts, geyser$waiting, function(x,y) dlogis(x,y,3))
  lines(gpts, c(kerMatg1 %*% rep(1/299,299)), lty=1)
  lines(gpts, c(kerMatg2 %*% rep(1/299,299)), lty=4)
  legend(locator(), legend=c("Bandwidth .5", "Bandwidth 3"), lty=c(1,4))

> lLk2 <- function(x, pwt, mn, scal, dens = dnorm) {
     dnsmat <- matrix(dens(outer(mn,x,"-")/scal)/scal, ncol=length(x))
     sum(log(pwt %*% dnsmat)) }

> tmpft <- nlmin(function(w) -lLk2(log(geyser$waiting), c(plogis(w[1]),
       1-plogis(w[1])), w[2:3], exp(w[4:5])), c(0.5,3.95,4.35,log(.2),log(.2)),
       print.level=1, max.fcal=300, max.iter=100)
> tmpft
$x:
[1] -0.7073344  4.0009645  4.3892868 -2.3041704 -2.4458907
$converged:
[1] T
$conv.type:
[1] "relative function convergence"

### Thus we have convergence to a mixture of two normals (for the log-waiting
### times) with proportions 0.3302, 0.6698 of N(4.0010, 0.0998^2) and
### N(4.3893, 0.0866^2).

### Now let's try a mixture of three normals:
> tmpft3 <- nlmin(function(w) -lLk2(log(geyser$waiting), c(plogis(w[1]),
       plogis(w[2])*(1-plogis(w[1])), (1-plogis(w[1]))*(1-plogis(w[2]))),
       w[3:5], exp(w[6:8])), c(0.5,0.5,3.95,4.15,4.35, rep(log(.2),3)),
       print.level=1, max.fcal=300, max.iter=100)
> tmpft3
$x:
[1] -1.2087436 -0.9080385  3.9655844  4.2238913  4.3998657 -2.5495674
[7] -1.7914989 -2.5778582
$converged:
[1] T
$conv.type:
[1] "relative function convergence"

### This gives normal components with probability weights
> c(plogis(tmpft3$x[1]), (1-plogis(tmpft3$x[1]))*plogis(tmpft3$x[2]),
    (1-plogis(tmpft3$x[1]))*(1-plogis(tmpft3$x[2])))
[1] 0.2299234 0.2213211 0.5487555
### and means and scale parameters respectively given by
> rbind(means=tmpft3$x[3:5], sdev=exp(tmpft3$x[6:8]))
            [,1]      [,2]       [,3]
means 3.96558439 4.2238913 4.39986566
sdev  0.07811545 0.1667101 0.07593647

> hist(geyser$waiting, nclass=20, prob=T, xlab="Waiting Times",
       ylab="Scaled Relative Frequency", main=
       "Geyser Waiting Times, Histogram & Overplotted Density Estimates")
### We overplot the waiting-time densities implied by the two- and
### three-component log-normal mixtures:
> fmix
function(y, pwt, mn, scal) {
    prb <- if(length(pwt) == 2)
         c(plogis(pwt[1]), (1 - plogis(pwt[1])) * plogis(pwt[2]),
           (1 - plogis(pwt[1])) * (1 - plogis(pwt[2])))
       else c(plogis(pwt[1]), (1 - plogis(pwt[1])))
    c(prb %*% matrix(dnorm(outer(mn,log(y),"-")/scal)/scal, ncol=length(y)))/y }
> lines(seq(43,108,.5), fmix(seq(43,108,.5), tmpft$x[1], tmpft$x[2:3],
       exp(tmpft$x[4:5])), lty=1)
> lines(seq(43,108,.5), fmix(seq(43,108,.5), tmpft3$x[1:2], tmpft3$x[3:5],
       exp(tmpft3$x[6:8])), lty=3)

### To compare the fit done both ways:
### logLik for the 2-component and for the 3-component models:
> sum(log(fmix(geyser$waiting, tmpft$x[1], tmpft$x[2:3], exp(tmpft$x[4:5]))))
[1] -1155.923
> sum(log(fmix(geyser$waiting, tmpft3$x[1:2], tmpft3$x[3:5], exp(tmpft3$x[6:8]))))
[1] -1154.563
### So the fit is only a tiny bit `better' with the third component --
### definitely not enough of an improvement over the 2-component model
### to be worth bothering with!
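### The nlmin() optimizer above is from S-Plus.  A minimal base-R sketch of the
### same two-component fit, using optim() with the lLk2 objective and starting
### values defined above (negll2 and fit2 are new names introduced here), might
### look like this; fit2$par should land near the $x vector reported by nlmin,
### up to optimizer tolerance:

  library(MASS)                      # for the geyser data
  negll2 <- function(w) -lLk2(log(geyser$waiting),
                  c(plogis(w[1]), 1-plogis(w[1])), w[2:3], exp(w[4:5]))
  fit2 <- optim(c(0.5, 3.95, 4.35, log(.2), log(.2)), negll2,
                method="BFGS", control=list(maxit=300))
  c(prop1 = plogis(fit2$par[1]),     # mixing proportion of the first component
    means = fit2$par[2:3],           # component means on the log scale
    sds   = exp(fit2$par[4:5]))      # component SDs on the log scale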
(26)

> library(MASS)
> rubberlm <- lm(loss ~ ., data=Rubber)
> rubberlm$coef
(Intercept)     hard      tens
   885.1611 -6.57083 -1.374312
### The simplest rule based on the linear regression is to predict each
### (hard,tens) observation as above-median if
###      885.1611 - 6.57083*hard - 1.374312*tens > 165.
### On the original data this yields
> table(rubberlm$fit > 165, Rubber$loss > 165)
        FALSE TRUE
  FALSE    14    1
  TRUE      1   14
### so only 2 are misclassified.

## The second method is logistic regression:
> Above <- as.numeric(Rubber$loss > 165)
  rubberlgs <- glm(cbind(Above, 1-Above) ~ hard+tens, data=Rubber,
                   family=binomial)
> rubberlgs$coef
(Intercept)       hard       tens
   57.24511 -0.5125454 -0.1120316
> table(rubberlgs$fit > 0.5, Rubber$loss > 165)
        FALSE TRUE
  FALSE    14    2
  TRUE      1   13
### now 3 are misclassified.

## Third method: predict based on whether the kernel NPR value is > 165.
> kerRubber <- outer(rubberlm$fit, rubberlm$fit, function(x,y) dlogis(x,y,40))
  ## 30x30 matrix:
  kerRubfit <- c(kerRubber %*% Rubber$loss)/c(kerRubber %*% rep(1,30))
> table(kerRubfit > 165, Rubber$loss > 165)
        FALSE TRUE
  FALSE    14    3
  TRUE      1   12
### now 4 are misclassified.

> RubbAlt <- cbind.data.frame(Rubber, binvec = as.numeric(Rubber$loss > 165))
> Prdct3 <- function(trainfram, testfram) {
     trainlm <- lm(loss ~ hard+tens, data=trainfram)
     testlmfit <- trainlm$coef[1] + c(data.matrix(testfram[,2:3]) %*%
                                      trainlm$coef[2:3])
     trainlgs <- glm(cbind(binvec, 1-binvec) ~ hard+tens, data=trainfram,
                     family=binomial)
     testlgsc <- trainlgs$coef[1] + c(data.matrix(testfram[,2:3]) %*%
                                      trainlgs$coef[2:3])
     kertrain <- outer(testlmfit, trainlm$fit, function(x,y) dlogis(x,y,40))
     kertest <- c(kertrain %*% trainfram$loss)/c(kertrain %*%
                                                 rep(1,nrow(trainfram)))
     list(lm.mis  = sum( (testlmfit > 165) != (testfram$loss > 165)),
          glm.mis = sum( (testlgsc > 0)    != (testfram$loss > 165)),
          ker.mis = sum( (kertest > 165)   != (testfram$loss > 165))) }
> unlist(Prdct3(RubbAlt, RubbAlt))
 lm.mis glm.mis ker.mis
      2       3       4
## Reproduces the previous calculation.
### This completes part (i).  NOTE that these estimates of misclassification
### rates out of 30 will turn out to be too low!

NOTE: it turns out in further testing with this function that -- as pointed
out to me by Sam Younkin -- there are several cases of cross-validation
training samples in which the logistic fit is "unstable": these are fits in
which the (not-quite-converged) models are trying to fit essentially 0-or-1
predictions, but where there are still some wrongly-predicted cases!

There were 50 warnings (use warnings() to see them)

> Misclass <- array(0, dim=c(1000,3),
                    dimnames=list(NULL, c("lm.mis","glm.mis","ker.mis")))
  for(i in 1:1000) {
     lvout <- sample(30,5)
     lvin  <- setdiff(1:30, lvout)
     Misclass[i,] <- unlist(Prdct4(RubbAlt[lvin,], RubbAlt[lvout,])) }
> apply(Misclass,2,sum)/5000
 lm.mis glm.mis ker.mis
 0.1148  0.1142  0.1264
### These are the estimated misclassification rates, suggesting that "linear"
### works best for these data and that roughly one in 8 points is
### misclassified.  (Actually, we would need a considerably larger simulation
### study to confirm this, and such a study is constrained by the fact that
### there are only {30 choose 5} = 142506 distinct leave-5-out subsets.)
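### For completeness, a sketch of how the leave-5-out loop above can be run
### with the Prdct3() defined earlier.  (Prdct4, used in the original log, is
### not shown there; presumably it is a variant of Prdct3 adjusted after the
### instability noted above.)  Misclass3 and the seed are illustrative names
### only; suppressWarnings() just keeps the glm() warnings off the console.

  set.seed(1)                                   # for reproducibility only
  Misclass3 <- array(0, dim=c(1000,3),
                     dimnames=list(NULL, c("lm.mis","glm.mis","ker.mis")))
  for (i in 1:1000) {
      lvout <- sample(30, 5)                    # 5 held-out rows
      lvin  <- setdiff(1:30, lvout)             # 25 training rows
      Misclass3[i,] <- suppressWarnings(
          unlist(Prdct3(RubbAlt[lvin,], RubbAlt[lvout,])))
  }
  apply(Misclass3, 2, sum)/5000                 # estimated misclassification rates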
*** Now for the bootstraps (part (iii)).  The basis for this is that we have
3 `statistics' (to be spelled out now) for estimation of
P(loss > 165 | hard, tens) based on a given data-set (a data frame with
columns loss, hard, tens).

The statistics are:
  (a) from normal linear regression:
          1 - Phi( (165 - b0 - b1*hard - b2*tens)/sigmahat )
  (b) from logistic regression:
          plogis( c0 + c1*hard + c2*tens )
  (c) from the kernel method (by analogy with the NPR approach):
          sum_i I[loss_i > 165] * ker((x - b0 - b1*hard_i - b2*tens_i)/40) /
                sum_i ker((x - b0 - b1*hard_i - b2*tens_i)/40)
These are the statistics we bootstrap to get confidence intervals.

> Estim3 <- function(evalfram, bootfram) {
     bootlm <- lm(loss ~ hard+tens, data=bootfram)
     evallmfit <- bootlm$coef[1] + c(data.matrix(evalfram[,2:3]) %*%
                                     bootlm$coef[2:3])
     lmest <- 1 - pnorm((165 - evallmfit)/summary(bootlm)$sigma)
     ## binvec <- as.numeric(bootfram$loss > 165)
     bootlgs <- glm(cbind(binvec, 1-binvec) ~ hard+tens, data=bootfram,
                    family=binomial)
     glmest <- plogis(bootlgs$coef[1] + c(data.matrix(evalfram[,2:3]) %*%
                                          bootlgs$coef[2:3]))
     kerboot <- outer(evallmfit, bootlm$fit, function(x,y) dlogis(x,y,40))
     kerest <- c(kerboot %*% bootfram$binvec)/c(kerboot %*%
                                                rep(1,nrow(bootfram)))
     list(lm.est = lmest, glm.est = glmest, ker.est = kerest) }
> Estim3(RubbAlt[1:5,], RubbAlt)
$lm.est:
[1] 0.99999998 0.85462905 0.50546810 0.19793889 0.04011242
$glm.est:
[1] 0.999999891 0.950345758 0.497125366 0.078551388 0.006529196
$ker.est:
[1] 0.9593606 0.6186340 0.4408652 0.3167944 0.2251193

### This completes the definition of the statistics.  Now we bootstrap, which
### means that the bootfram data will be generated by the different
### with-replacement (nonparametric bootstrap) resampling strategies, while
### evalfram will always be Rubber[1:5,].

> Boot2mat <- Boot1mat <- array(0, dim=c(4000,5,3))
> for (i in 1:4000) Boot1mat[i,,] <- matrix(unlist(Estim3(RubbAlt[1:5,],
       RubbAlt[sample(30,30, replace=T),])), ncol=3)
### This was the simplest bootstrap strategy: pure with-replacement sampling
### of (loss,hard,tens) triples.

### Next consider what is probably the most appealing method for bootstrapping
### a dataset which approximately satisfies a linear regression model with
### additive iid errors.  We begin by setting up the linear regression and
### residuals from the original data, then bootstrapping (hard,tens) with
### replacement and independently bootstrapping residuals with replacement.
> resids <- rubberlm$resid
  yhats <- rubberlm$fit
> for (i in 1:4000) {
     inds1 <- sample(30,30,replace=T)
     inds2 <- sample(30,30,replace=T)
     Boot2mat[i,,] <- matrix(unlist(Estim3(RubbAlt[1:5,],
          cbind.data.frame(loss = yhats[inds1]+resids[inds2],
                           RubbAlt[inds1,2:4]))), ncol=3) }
### In each of these loops there were warnings (of exactly the same type as in
### the cross-validation steps) from the logistic-regression glm() fits.
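### One way to gauge how often a bootstrap replicate produces one of the
### "unstable" logistic fits just mentioned is to count the replicates whose
### glm() call raises a warning.  This is a sketch, not part of the original
### log; "flagged", "bs" and the run of 200 replicates are illustrative choices.

  flagged <- 0
  for (i in 1:200) {
      bs <- RubbAlt[sample(30, 30, replace=TRUE), ]
      warned <- FALSE
      withCallingHandlers(
          glm(cbind(binvec, 1-binvec) ~ hard + tens, data=bs, family=binomial),
          warning = function(w) { warned <<- TRUE
                                  invokeRestart("muffleWarning") })
      if (warned) flagged <- flagged + 1
  }
  flagged         # number of the 200 replicates that triggered a glm() warning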
> M1 <- round( apply(Boot1mat, 2:3, function(x) quantile(x,.025)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 1.0000 0.8583
[2,] 0.6636 0.5107 0.4253
[3,] 0.2621 0.0000 0.2609
[4,] 0.0574 0.0000 0.1600
[5,] 0.0033 0.0000 0.0959
> M2 <- round( apply(Boot1mat, 2:3, function(x) quantile(x,.975)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 1.0000 0.9868
[2,] 0.9680 1.0000 0.7747
[3,] 0.6986 1.0000 0.6166
[4,] 0.3266 0.5820 0.4934
[5,] 0.0898 0.0686 0.3942
> M3 <- round( apply(Boot2mat, 2:3, function(x) quantile(x,.025)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 0.9999 0.8604
[2,] 0.6324 0.5483 0.3842
[3,] 0.2482 0.0000 0.2410
[4,] 0.0445 0.0000 0.1521
[5,] 0.0022 0.0000 0.0921
> M4 <- round( apply(Boot2mat, 2:3, function(x) quantile(x,.975)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 1.0000 0.9882
[2,] 0.9836 1.0000 0.8174
[3,] 0.7670 1.0000 0.6655
[4,] 0.3935 0.6294 0.5291
[5,] 0.1183 0.0698 0.4147

### We still have to make confidence intervals out of these quantiles and the
### estimated quantities above (from the original dataset).

### Of course, instead of bootstrapping the residuals we could have taken the
### parametric approach of simulating them as normally distributed with mean 0
### and variance summary(rubberlm)$sigma^2, but that is very similar and less
### appealing!  (A sketch of that variant is given below, after the remarks on
### the functionals.)

OK, for constructing CI's we focus on the function of the data which results
in the estimated probability P(loss > 165 | hard, tens).  Unfortunately, there
is no nonparametric functional which gives a consistent estimator, and this is
not only a problem of the small sample size (30), but also a problem of the
type of (conditional!) probability we are estimating.  The main issue is that,
in a nonparametric context, we have only the observations precisely at the
specific (hard,tens) value to help us estimate this conditional probability.

*** So I sent you all a long message indicating that the functionals T that we
*** are estimating are those indicated above: the one based on normal linear
*** regression, the one based on logistic regression, and the nonparametric
*** kernel regression using the linear combination of the hard and tens
*** variables estimated from linear regression.
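### A sketch of the parametric variant just mentioned: normal errors with SD
### summary(rubberlm)$sigma replace the resampled residuals, and only the loss
### column changes relative to the Boot2 loop.  Boot3mat and sighat are new
### names introduced here, and binvec is carried along from the resampled rows
### exactly as in the Boot2 loop above.

  Boot3mat <- array(0, dim=c(4000,5,3))
  sighat   <- summary(rubberlm)$sigma
  for (i in 1:4000) {
      inds1 <- sample(30, 30, replace=TRUE)
      Boot3mat[i,,] <- matrix(unlist(Estim3(RubbAlt[1:5,],
           cbind.data.frame(loss = yhats[inds1] + rnorm(30, 0, sighat),
                            RubbAlt[inds1, 2:4]))), ncol=3)
  }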
### We complete this analysis by generating an array of confidence intervals,
### as follows: 5 points by 3 estimators by 2 bootstrap methods by
### (lower, upper) CI endpoints.

> Rubber[1:5, 2:3]
  hard tens
1   45  162
2   55  233
3   61  232
4   66  231
5   71  231
> CIarr <- array(0, dim=c(5,3,2,2), dimnames=list(paste("Pt",1:5,sep=""),
       c("lin.reg","lgst.reg","ker.NPR"), c("Lower","Upper"),
       c("Bootstr.a","Bootstr.c")))
> Ests <- matrix(unlist(Estim3(RubbAlt[1:5,], RubbAlt)), ncol=3)
> Ests
           [,1]        [,2]      [,3]
[1,] 0.99999998 0.999999891 0.9593606
[2,] 0.85462905 0.950345758 0.6186340
[3,] 0.50546810 0.497125366 0.4408652
[4,] 0.19793889 0.078551388 0.3167944
[5,] 0.04011242 0.006529196 0.2251193
> CIarr[,,1,1] <- 2*Ests - M2      ### This is the nonparametric
  CIarr[,,2,1] <- 2*Ests - M1      ### bootstrap form of the CI's
  CIarr[,,1,2] <- 2*Ests - M4
  CIarr[,,2,2] <- 2*Ests - M3
> round(pmin(pmax(CIarr,0),1), 3)
, , Lower, Bootstr.a
    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   0.932
Pt2   0.741    0.901   0.463
Pt3   0.312    0.000   0.265
Pt4   0.069    0.000   0.140
Pt5   0.000    0.000   0.056

, , Upper, Bootstr.a
    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   1.000
Pt2   1.000    1.000   0.812
Pt3   0.749    0.994   0.621
Pt4   0.338    0.157   0.474
Pt5   0.077    0.013   0.354

, , Lower, Bootstr.c
    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   0.931
Pt2   0.726    0.901   0.420
Pt3   0.244    0.000   0.216
Pt4   0.002    0.000   0.104
Pt5   0.000    0.000   0.036

, , Upper, Bootstr.c
    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   1.000
Pt2   1.000    1.000   0.853
Pt3   0.763    0.994   0.641
Pt4   0.351    0.157   0.481
Pt5   0.078    0.013   0.358
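### For comparison (a sketch, not part of the original log; CIpct is a new
### name): the simpler percentile-method intervals would use the bootstrap
### quantiles M1--M4 directly rather than reflecting them around Ests.

  CIpct <- array(0, dim=c(5,3,2,2), dimnames=dimnames(CIarr))
  CIpct[,,1,1] <- M1;  CIpct[,,2,1] <- M2    # percentile CI's, Bootstrap (a)
  CIpct[,,1,2] <- M3;  CIpct[,,2,2] <- M4    # percentile CI's, Bootstrap (c)
  round(CIpct, 3)
  ## The reflected form 2*Ests - quantile used above corrects the intervals for
  ## bootstrap bias but can fall outside [0,1] (hence the pmin/pmax truncation);
  ## the percentile form stays inside [0,1] automatically.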