Homework Solutions to Problems 22.
==================================
[NB solution is in Splus 6.0]

Splus code to reproduce calculations in BassSAS.txt.

## Getting the data
> Bass <- read.table(file="Bass.Dat", skip=35, header=T)
> names(Bass)
 [1] "ID"                  "Alkalinity"          "pH"                 
 [4] "Calcium"             "Chlorophyll"         "AvgMercury"         
 [7] "No.samples"          "min"                 "max"                
[10] "X3yrStandardMercury" "agedata"            
## interesting that the new version of Splus automatically takes the
## name of the lake as the row label
> names(Bass) <- c("Id","Alk","pH","Calc","Chlor",
      "AvMrc","nsamp","min","max","Mrc3yr","agedat")
> Bass <- cbind.data.frame(logAvM = log(Bass$AvMrc), Bass)

## Plotting would be accomplished by :
  > plot(Bass$Alk, Bass$AvMrc)   ## etc.
> lm1 <- lm(logAvM ~ Alk, data=Bass)

NOTE THAT SAS IN CREATING AN OUTPUT FILE FROM PROC REG ALSO INCLUDES
ALL OF THE ORIGINAL DATASET'S COLUMNS UNLESS YOU SUPPRESS THEM !!

## predictors:
> lm1$fitted[1:7]
  Alligator      Annie    Apopka BlueCypress      Brick     Bryant     Cherry 
 -0.4137453 -0.3760587 -2.142617  -0.9397872 -0.3603559 -0.6288729 -0.4027533
### all OK; and so are the residuals
> lm1$resid[1:7]
 Alligator     Annie    Apopka BlueCypress     Brick     Bryant     Cherry 
 0.6207594 0.6612376 -1.076258   0.1188066 0.5426775 -0.6804605 -0.3312158

> Xm <- model.matrix(lm1)
> summary(lm1)$cov.un
              (Intercept)           Alk 
(Intercept)  0.0374267821 -4.945048e-04
        Alk -0.0004945048  1.317619e-05
> solve(t(Xm) %*% Xm)
              (Intercept)           Alk 
(Intercept)  0.0374267821 -4.945048e-04
        Alk -0.0004945048  1.317619e-05
> Hat <- Xm %*% summary(lm1)$cov.un %*% t(Xm)  ## 53x53 matrix
## Now studentize residual by using standard errors sigma*sqrt(1-h_{ii})
> StudRsd <- (lm1$resid/sqrt(1-diag(Hat)))/summary(lm1)$sigma
  StudRsd[1:7]
 Alligator    Annie    Apopka BlueCypress    Brick    Bryant     Cherry 
  1.064066 1.134668 -1.913229   0.2022827 0.931661 -1.161048 -0.5679219

## Now Cook's Distance:
> CookD <- (StudRsd)^2*(0.5*diag(Hat)/(1-diag(Hat)))
  CookD[1:7]
 Alligator      Annie    Apopka  BlueCypress      Brick     Bryant     Cherry 
0.01874503 0.02274477 0.2033591 0.0003944242 0.01575785 0.01594074 0.00544142

## Finally the confidence limits for mean (CLM) and individual obs (CL)
> SE1 <-  summary(lm1)$sigma*sqrt(diag(Hat))
  SE2 <-  summary(lm1)$sigma*sqrt(1-diag(Hat))
  rbind(LCLM = lm1$fit[1:6] - qt(.975,51)*SE1[1:6], 
        UCLM = lm1$fit[1:6] + qt(.975,51)*SE1[1:6],
        LCL  = lm1$fit[1:6] - qt(.975,51)*SE2[1:6], 
        UCL  = lm1$fit[1:6] + qt(.975,51)*SE2[1:6])
      Alligator      Annie    Apopka BlueCypress      Brick     Bryant 
LCLM -0.6268623 -0.5959707 -2.519064  -1.1035042 -0.5831806 -0.8098173
UCLM -0.2006282 -0.1561466 -1.766171  -0.7760702 -0.1375313 -0.4479284
 LCL -1.5849385 -1.5459950 -3.271954  -2.1189009 -1.5297410 -1.8054660
 UCL  0.7574479  0.7938777 -1.013281   0.2393265  0.8090292  0.5477203

> anova(lm1)                  ### Same info as in main SAS table
Analysis of Variance Table
          Df Sum of Sq  Mean Sq  F Value        Pr(F) 
      Alk  1  18.71377 18.71377 53.22362 1.859181e-09
Residuals 51  17.93193  0.35161                       

> summary(lm1)$coef
                  Value  Std. Error   t value     Pr(>|t|) 
(Intercept) -0.32109907 0.114714868 -2.799106 7.216728e-03
        Alk -0.01570274 0.002152402 -7.295452 1.859181e-09

                  Parameter     Standard
Variable   DF     Estimate      Error    t Value    Pr > |t|

Intercept  1      -0.32110       0.11471   -2.80     0.0072
Alk        1      -0.01570       0.00215   -7.30     <.0001

### Next a plot (omitted) includng fitted line, actual points 
    and CL's and CLM's.

## Next comes the summary statistics for CookD, logMrsd, Studnt :
> round(c(quantile(CookD,prob=c(0,.05,.25,.5,.75,.95,1)),
     sqrt(var(CookD))),4)
   0%    5%    25%   50%    75%    95%   100%        
    0 1e-04 0.0018 0.005 0.0178 0.1185 0.2637 0.0491
> round(c(quantile(lm1$resid,prob=c(0,.05,.25,.5,.75,.95,1)),
     sqrt(var(CookD))),4)
      0%      5%     25%    50%    75%    95%  100%        
 -2.0655 -0.8741 -0.2795 0.0822 0.2923 0.7118 1.792 0.0491
> round(c(quantile(StudRsd,prob=c(0,.05,.25,.5,.75,.95,1)),
     sqrt(var(StudRsd))),4)
      0%      5%     25%    50%    75%    95%   100%       
 -3.5224 -1.4911 -0.4774 0.1409 0.5029 1.2174 3.1037 1.011

> inds <- union((1:53)[CookD > 0.140 | abs(lm1$resid) > 1 | 
      abs(StudRsd) > 1.5], c(36,52))
> inds
[1]  3 18 36 38 40 52
> cbind(data.matrix(Bass[inds,1:3]), logMrsd=lm1$resid[inds], logMprd=
   lm1$fit[inds], CookD=CookD[inds], Studnt = StudRsd[inds])
             logAvM Id   Alk    logMrsd    logMprd      CookD    Studnt 
 Apopka -3.21887582  3 116.0 -1.0762584 -2.1426174 0.20335906 -1.913229
 Harney -0.26136476 18  61.3  1.0223125 -1.2836773 0.04124808  1.747211
 Orange -1.71479843 36  25.4 -0.9948497 -0.7199488 0.03054176 -1.695488
 Parker -3.21887582 38  53.0 -2.0655313 -1.1533445 0.13968778 -3.522399
 Puzzle  0.09531018 40  87.6  1.7919696 -1.6966595 0.26365592  3.103668
Wildcat -1.38629436 52  17.3 -0.7935378 -0.5927565 0.02281806 -1.354791


> summary(lm1)$r.sq  ### = 0.5106676
  lm2 <- update(lm1, data=Bass[setdiff(1:53,inds),])
> summary(lm2)$r.sq
[1] 0.7197122
> rbind(lm1$coef,lm2$coef)
     (Intercept)         Alk 
[1,]  -0.3210991 -0.01570274
[2,]  -0.2656866 -0.01600271

## Now produce Low and Hi predictors at "outlying" points, from model
   defined on 47 non-`outlying' points:
> c(model.matrix(lm1)[inds,] %*% lm2$coef) +  outer(sqrt(1 +  
    diag(Xm[inds,] %*% summary(lm2)$cov.un %*% t(Xm[inds,]))), 
    c(-1,1))* summary(lm2)$sigma*qt(.975,45)
          [,1]       [,2] 
[1,] -2.934843 -1.3091579
[2,] -2.026140 -0.4671655
[3,] -1.448031  0.1037205
[4,] -1.891160 -0.3365000
[5,] -2.458992 -0.8760555
[6,] -1.319657  0.2345897	#### Not exactly the values
   ### previously produced, but close, and reasonable ...

> cbind(data.matrix(Bass[inds,1:3]), fit1=lm1$fit[inds],
+   fit2=c(model.matrix(lm1)[inds,] %*% lm2$coef))
             logAvM Id   Alk       fit1       fit2 
 Apopka -3.21887582  3 116.0 -2.1426174 -2.1220007
 Harney -0.26136476 18  61.3 -1.2836773 -1.2466526
 Orange -1.71479843 36  25.4 -0.7199488 -0.6721554
 Parker -3.21887582 38  53.0 -1.1533445 -1.1138301
 Puzzle  0.09531018 40  87.6 -1.6966595 -1.6675238
Wildcat -1.38629436 52  17.3 -0.5927565 -0.5425335

Further illustration related to Stepwise model-selection !

  proc reg data=bass2;
       model logAvM = Alk pH Calc Chlor /
             selection = forward best=2 ;
     run;
                Summary of Forward Selection

       Variable   Number     Partial    Model
Step   Entered    Vars In    R-Square   R-Square   C(p)  F-Val  Pr>F

1       Alk        1       0.5107     0.5107     29.7014  53.22  <.0001
2     Chlor        2       0.1734     0.6841      3.8085  27.45  <.0001
3     Calc         3       0.0154     0.6995      3.3266   2.52  0.1191