HW 18 Solutions --------------- R log for Bass data analysis > bass = read.table("tmpBass.txt", header=T) ### could have used the SAS file imported with read.ssd ## Response variable is log(AvMrc) > logAvM = log(bass$AvgMerc)) plot(bass$Alk,logAvM, xlab="Alk", ylab="logAvMrc", main="Scatterplot, bass data, log(AvMrc) vs. Alk") ## Clear negative tendency, but with some ### outlying observations not well described by ### univariate functional relation. > tmplm = lm(logAvM ~ Alkalinity, data=bass) anova(tmplm) Analysis of Variance Table Response: logAvM Df Sum Sq Mean Sq F value Pr(>F) Alkalinity 1 18.7138 18.7138 53.224 1.859e-09 *** Residuals 51 17.9319 0.3516 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## NOTE that `corrected total' is simply > (53-1)*var(logAvM) [1] 36.64571 ## Residual sum of squares is also: sum(tmplm$resid^2) ## Output file "bass3" contained predictor, residual, CookD, ## studentized residuals, and lower and upper conf limits ## means at each observation. ## We next calculate these using information from `tmplm' > c(summary(tmplm)$sigma^2, sum(tmplm$resid^2)/51) [1] 0.3516065 0.3516065 ## sigma-hat^2 calculated two ways > Xmat = model.matrix(tmplm) ## 53 x 2 design matrix sum(abs(solve(t(Xmat)%*% Xmat)-summary(tmplm)$cov.unsc)) [1] 3.578884e-17 > Hatmat = Xmat %*% summary(tmplm)$cov.unsc %*% t(Xmat) ## Hat matrix H = X (X^{tr} X)^{-1} X^{tr}, nxn ## Vector of standard errors for predicted means is > devpred = sqrt(diag(Hatmat)*sum(tmplm$resid^2)/51) devresd = sqrt((1-diag(Hatmat))*sum(tmplm$resid^2)/51) stdresd = tmplm$resid/devresd ## Now standard errors for residuals > bass3 = cbind(logMprd = tmplm$fit, logMrsd = tmplm$resid, Cookd = stdresd^2*(diag(Hatmat)/(1-diag(Hatmat)))/2, Studnt=stdresd, LCLM = tmplm$fit - qt(.975,51)*devpred, UCLM = tmplm$fit + qt(.975,51)*devpred) #### 53 x 6 > dimnames(bass3)[[2]] [1] "logMprd" "logMrsd" "Cookd" "Studnt" "LCLM" "UCLM" ## For parameter estimates and significance, > summary(tmplm)$coef Estimate Std. Error t value Pr(>|t|) (Intercept) -0.32109907 0.114714868 -2.799106 7.216728e-03 Alkalinity -0.01570274 0.002152402 -7.295452 1.859181e-09 ## (b) contains scatterplot of residuals from linear ## model fit overplotted with prediction line and ## horizontal reference line at 0. > plot(bass$Alk,logAvM, xlab="Alk", ylab="log(AvgMerc)", main="Observations and Predictions from regression") abline(h=0) lines(bass$Alk,bass3[,"logMprd"], lty=3) ### (c) sumary statistics for the cookd, logMrsd, and Studnt ### output columns of "bass3" matrix > t(apply(bass3[,c(3,2,4)],2, function(col) round(c(min=min(col), max=max(col), qrang=sum(c(-1,1)*quantile(col, c(.25,.75))), stddev=sd(col), quantile(col, .95), quantile(col, .05)), 3))) min max qrang stddev 95% 5% Cookd 0.000 0.264 0.016 0.049 0.119 0.000 logMrsd -2.066 1.792 0.572 0.587 0.712 -0.874 Studnt -3.522 3.104 0.980 1.011 1.217 -1.491 ### NB: the interpolation used for the 5th and 95 percentiles ### is evidently different for SAS and R, but everything else ### checks out: you can see in the R documentation that there ### is a "SAS definition" type=3 which gives exact SAS result! > t(apply(bass3[,c(3,2,4)],2, function(col) round(c(-quantile(-col, .05, type=3), quantile(col, .05, type=3)), 3))) Cookd 0.140 0.000 logMrsd 0.788 -0.995 Studnt 1.341 -1.695 ## Next display columns 1:4 of bass3 for selected observations: > inds = bass3[,"Cookd"]> .140 | abs(bass3[,"logMrsd"])>1 | bass$ID %in% c(36,52) | bass3[,"Studnt"]> 1.5 round(cbind(bass$Alk[inds], logAvM=logAvM[inds], bass3[inds, 1:4]),5) logAvM logMprd logMrsd Cookd Studnt 3 116.0 -3.21888 -2.14262 -1.07626 0.20336 -1.91323 18 61.3 -0.26136 -1.28368 1.02231 0.04125 1.74721 36 25.4 -1.71480 -0.71995 -0.99485 0.03054 -1.69549 38 53.0 -3.21888 -1.15334 -2.06553 0.13969 -3.52240 40 87.6 0.09531 -1.69666 1.79197 0.26366 3.10367 52 17.3 -1.38629 -0.59276 -0.79354 0.02282 -1.35479 ## Now part (d): > c(r.sq=summary(tmplm)$r.sq, adjr.sq=summary(tmplm)$adj.r.sq) r.sq adjr.sq 0.5106676 0.5010728 > 1-sum(tmplm$resid^2)/(52*var(logAvM)) [1] 0.5106676 > inds2 = bass3[,"Cookd"]> .140 | abs(bass3[,"logMrsd"])>1 | bass$ID %in% c(36,52) newlm = lm(log(AvgMercury) ~ Alkalinity, data=bass[!inds2,]) > newlm$coef (Intercept) Alkalinity -0.26568664 -0.01600271 ### formerly: -0.3211, -0.0157 > c(r.sq=summary(newlm)$r.sq, adjr.sq=summary(newlm)$adj.r.sq) r.sq adjr.sq 0.7197122 0.7134836