Homework Solutions to Problems 22. ================================== [NB solution is in Splus 6.0] Splus code to reproduce calculations in BassSAS.txt. ## Getting the data > Bass <- read.table(file="Bass.Dat", skip=35, header=T) > names(Bass) [1] "ID" "Alkalinity" "pH" [4] "Calcium" "Chlorophyll" "AvgMercury" [7] "No.samples" "min" "max" [10] "X3yrStandardMercury" "agedata" ## interesting that the new version of Splus automatically takes the ## name of the lake as the row label > names(Bass) <- c("Id","Alk","pH","Calc","Chlor", "AvMrc","nsamp","min","max","Mrc3yr","agedat") > Bass <- cbind.data.frame(logAvM = log(Bass$AvMrc), Bass) ## Plotting would be accomplished by : > plot(Bass$Alk, Bass$AvMrc) ## etc. > lm1 <- lm(logAvM ~ Alk, data=Bass) NOTE THAT SAS IN CREATING AN OUTPUT FILE FROM PROC REG ALSO INCLUDES ALL OF THE ORIGINAL DATASET'S COLUMNS UNLESS YOU SUPPRESS THEM !! ## predictors: > lm1$fitted[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry -0.4137453 -0.3760587 -2.142617 -0.9397872 -0.3603559 -0.6288729 -0.4027533 ### all OK; and so are the residuals > lm1$resid[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry 0.6207594 0.6612376 -1.076258 0.1188066 0.5426775 -0.6804605 -0.3312158 > Xm <- model.matrix(lm1) > summary(lm1)$cov.un (Intercept) Alk (Intercept) 0.0374267821 -4.945048e-04 Alk -0.0004945048 1.317619e-05 > solve(t(Xm) %*% Xm) (Intercept) Alk (Intercept) 0.0374267821 -4.945048e-04 Alk -0.0004945048 1.317619e-05 > Hat <- Xm %*% summary(lm1)$cov.un %*% t(Xm) ## 53x53 matrix ## Now studentize residual by using standard errors sigma*sqrt(1-h_{ii}) > StudRsd <- (lm1$resid/sqrt(1-diag(Hat)))/summary(lm1)$sigma StudRsd[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry 1.064066 1.134668 -1.913229 0.2022827 0.931661 -1.161048 -0.5679219 ## Now Cook's Distance: > CookD <- (StudRsd)^2*(0.5*diag(Hat)/(1-diag(Hat))) CookD[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry 0.01874503 0.02274477 0.2033591 0.0003944242 0.01575785 0.01594074 0.00544142 ## Finally the confidence limits for mean (CLM) and individual obs (CL) > SE1 <- summary(lm1)$sigma*sqrt(diag(Hat)) SE2 <- summary(lm1)$sigma*sqrt(1-diag(Hat)) rbind(LCLM = lm1$fit[1:6] - qt(.975,51)*SE1[1:6], UCLM = lm1$fit[1:6] + qt(.975,51)*SE1[1:6], LCL = lm1$fit[1:6] - qt(.975,51)*SE2[1:6], UCL = lm1$fit[1:6] + qt(.975,51)*SE2[1:6]) Alligator Annie Apopka BlueCypress Brick Bryant LCLM -0.6268623 -0.5959707 -2.519064 -1.1035042 -0.5831806 -0.8098173 UCLM -0.2006282 -0.1561466 -1.766171 -0.7760702 -0.1375313 -0.4479284 LCL -1.5849385 -1.5459950 -3.271954 -2.1189009 -1.5297410 -1.8054660 UCL 0.7574479 0.7938777 -1.013281 0.2393265 0.8090292 0.5477203 > anova(lm1) ### Same info as in main SAS table Analysis of Variance Table Df Sum of Sq Mean Sq F Value Pr(F) Alk 1 18.71377 18.71377 53.22362 1.859181e-09 Residuals 51 17.93193 0.35161 > summary(lm1)$coef Value Std. Error t value Pr(>|t|) (Intercept) -0.32109907 0.114714868 -2.799106 7.216728e-03 Alk -0.01570274 0.002152402 -7.295452 1.859181e-09 Parameter Standard Variable DF Estimate Error t Value Pr > |t| Intercept 1 -0.32110 0.11471 -2.80 0.0072 Alk 1 -0.01570 0.00215 -7.30 <.0001 ### Next a plot (omitted) includng fitted line, actual points and CL's and CLM's. ## Next comes the summary statistics for CookD, logMrsd, Studnt : > round(c(quantile(CookD,prob=c(0,.05,.25,.5,.75,.95,1)), sqrt(var(CookD))),4) 0% 5% 25% 50% 75% 95% 100% 0 1e-04 0.0018 0.005 0.0178 0.1185 0.2637 0.0491 > round(c(quantile(lm1$resid,prob=c(0,.05,.25,.5,.75,.95,1)), sqrt(var(CookD))),4) 0% 5% 25% 50% 75% 95% 100% -2.0655 -0.8741 -0.2795 0.0822 0.2923 0.7118 1.792 0.0491 > round(c(quantile(StudRsd,prob=c(0,.05,.25,.5,.75,.95,1)), sqrt(var(StudRsd))),4) 0% 5% 25% 50% 75% 95% 100% -3.5224 -1.4911 -0.4774 0.1409 0.5029 1.2174 3.1037 1.011 > inds <- union((1:53)[CookD > 0.140 | abs(lm1$resid) > 1 | abs(StudRsd) > 1.5], c(36,52)) > inds [1] 3 18 36 38 40 52 > cbind(data.matrix(Bass[inds,1:3]), logMrsd=lm1$resid[inds], logMprd= lm1$fit[inds], CookD=CookD[inds], Studnt = StudRsd[inds]) logAvM Id Alk logMrsd logMprd CookD Studnt Apopka -3.21887582 3 116.0 -1.0762584 -2.1426174 0.20335906 -1.913229 Harney -0.26136476 18 61.3 1.0223125 -1.2836773 0.04124808 1.747211 Orange -1.71479843 36 25.4 -0.9948497 -0.7199488 0.03054176 -1.695488 Parker -3.21887582 38 53.0 -2.0655313 -1.1533445 0.13968778 -3.522399 Puzzle 0.09531018 40 87.6 1.7919696 -1.6966595 0.26365592 3.103668 Wildcat -1.38629436 52 17.3 -0.7935378 -0.5927565 0.02281806 -1.354791 > summary(lm1)$r.sq ### = 0.5106676 lm2 <- update(lm1, data=Bass[setdiff(1:53,inds),]) > summary(lm2)$r.sq [1] 0.7197122 > rbind(lm1$coef,lm2$coef) (Intercept) Alk [1,] -0.3210991 -0.01570274 [2,] -0.2656866 -0.01600271 ## Now produce Low and Hi predictors at "outlying" points, from model defined on 47 non-`outlying' points: > c(model.matrix(lm1)[inds,] %*% lm2$coef) + outer(sqrt(1 + diag(Xm[inds,] %*% summary(lm2)$cov.un %*% t(Xm[inds,]))), c(-1,1))* summary(lm2)$sigma*qt(.975,45) [,1] [,2] [1,] -2.934843 -1.3091579 [2,] -2.026140 -0.4671655 [3,] -1.448031 0.1037205 [4,] -1.891160 -0.3365000 [5,] -2.458992 -0.8760555 [6,] -1.319657 0.2345897 #### Not exactly the values ### previously produced, but close, and reasonable ... > cbind(data.matrix(Bass[inds,1:3]), fit1=lm1$fit[inds], + fit2=c(model.matrix(lm1)[inds,] %*% lm2$coef)) logAvM Id Alk fit1 fit2 Apopka -3.21887582 3 116.0 -2.1426174 -2.1220007 Harney -0.26136476 18 61.3 -1.2836773 -1.2466526 Orange -1.71479843 36 25.4 -0.7199488 -0.6721554 Parker -3.21887582 38 53.0 -1.1533445 -1.1138301 Puzzle 0.09531018 40 87.6 -1.6966595 -1.6675238 Wildcat -1.38629436 52 17.3 -0.5927565 -0.5425335 Further illustration related to Stepwise model-selection ! proc reg data=bass2; model logAvM = Alk pH Calc Chlor / selection = forward best=2 ; run; Summary of Forward Selection Variable Number Partial Model Step Entered Vars In R-Square R-Square C(p) F-Val Pr>F 1 Alk 1 0.5107 0.5107 29.7014 53.22 <.0001 2 Chlor 2 0.1734 0.6841 3.8085 27.45 <.0001 3 Calc 3 0.0154 0.6995 3.3266 2.52 0.1191