Log for Solution of HW 5, Stat 798c 4/18/03 ======================================= (I) The wording of the problem is somewhat unclear (sorry!). (1) You are asked first for the average amount sold by each sales-rep, by month: in retrospect, what this should mean is that the data are sorted over sales-rep and month and that the average of the monthly total sales amounts for each rep are printed out (a vector or `table' of three numbers). (2) Next you are asked for the average amount sold, by quarter. In this case, the best interpretation is that the data are sorted by month, quarterly totals are calculated and the average of the 4 quarterly amounts printed out (1 number). (3) Totals are easier to interpret: here the data are to be sorted by Price & Type, and totals within each sorted category printed out, omitting month and quarter and sales-rep. (4) Here we print the subset of Deluxe-type data, sorted: no interpretation needed for this one. First comes the Splus code (done in R, starting from ASCII file containing the data with 1 header-line), then the SAS, then the SAS edited printouts. > salesdat <- read.table("SASdata.txt", header=T) > names(salesdat) [1] "Mth" "Qtr" "Rep" "Type" "Units" "Price" "amtsold" [1] "Mth" "Qtr" "Rep" "Type" "Units" "Price" > salesdat <- cbind.data.frame(salesdat, amtsold=salesdat[,5]*salesdat[,6]) > cat("Average Monthly Sales by Rep \n") Average Monthly Sales by Rep > aggregate.data.frame(salesdat[order(salesdat[,3]),7], by=list(Rep=salesdat[order(salesdat[,3]),3]),mean) Rep x 1 Hollingsworth 6916.359 2 Jones 7766.079 3 Smith 8190.158 > cat("Average of Total Quarterly Sales Results \n", sum(salesdat[,7]), "\n") Average of Total Quarterly Sales Results 844040.8 > cat("Total Units Sold, by Price and Type \n") > tmpord <- order(interaction(factor(salesdat[,6]),salesdat[,3])) > aggregate.data.frame(data.frame(Units=salesdat[tmpord,5]), by=list( Price=salesdat[tmpord,6], type=salesdat[tmpord,4]),sum) Price type Units 1 29.5 Deluxe 2525 2 19.95 Standard 38234 3 29.5 Standard 230 > salesdat2 <- salesdat[salesdat[,"Type"]=="Deluxe",c(1,3,5:7)] > tmpord2 <- order(interaction(salesdat2[,3], factor(salesdat2[,1])) > cat("Deluxe Sales by Month and Rep \n") > salesdat2[tmpord2,] Mth Rep Units Price amtsold 1 1 Hollingsworth 260 29.5 7670.0 5 1 Smith 715 29.5 21092.5 8 2 Smith 10 29.5 295.0 32 6 Hollingsworth 25 29.5 737.5 40 7 Hollingsworth 30 29.5 885.0 38 7 Hollingsworth 60 29.5 1770.0 48 7 Hollingsworth 60 29.5 1770.0 67 8 Jones 45 29.5 1327.5 72 8 Hollingsworth 50 29.5 1475.0 57 8 Smith 110 29.5 3245.0 69 8 Smith 110 29.5 3245.0 75 8 Jones 225 29.5 6637.5 82 9 Jones 275 29.5 8112.5 91 9 Jones 275 29.5 8112.5 105 11 Hollingsworth 150 29.5 4425.0 108 12 Hollingsworth 125 29.5 3687.5 ### Now begin SAS code: easiest approach is to use PROC TABULATE [ assume data-step given in exercise has first been used to create working SAS dataset "allyear" and column amtsold=units*price. ] NOTE: The data set WORK.ALLYEAR has 110 observations and 7 variables. proc sort out=allyear2; by salesrep mth; data allyear2 (drop= amtsold) ; set allyear2 (drop=qtr units price type); by salesrep mth; if first.mth then Totsold = 0; Totsold + amtsold; if last.mth then output; proc tabulate data=allyear2; title 'Average amt sold per salesrep by month'; class salesrep mth; var Totsold ; table salesrep, Totsold*mean; run; proc sort data=allyear out=allyear3; by qtr; data allyear3; set allyear3 (keep=qtr amtsold); by qtr; if first.qtr then Totsold=0; Totsold + amtsold; if last.qtr then output; proc tabulate data=allyear3; title 'Average quarterly sales'; var Totsold; table Totsold*mean; run; proc sort data=allyear out=allyear4; by price type; data allyear4 (drop=mth qtr salesrep ); set allyear; by price type; if first.price or first.type then totprtyp=0; /* wouldn't it work with just first.type ? */ totprtyp+units; if last.price or last.type; /* wouldn't it work with just last.type ? */ proc print; title 'Totals by Price & Type'; run; Totals by Price & Type Obs type units price amtsold totprtyp 1 Standard 1000 19.95 19950.0 38234 2 Deluxe 125 29.50 3687.5 2525 3 Standard 230 29.50 6785.0 230 proc sort data=allyear out=allyear5; by mth salesrep; data allyear5; set allyear5; if type="Deluxe"; proc print; title 'Deluxe Units Sold, by Mth & Salesrep'; run; Deluxe Units Sold, by Mth & Salesrep Obs mth qtr salesrep type units price amtsold 1 01 1 Hollingsworth Deluxe 260 29.5 7670.0 2 01 1 Smith Deluxe 715 29.5 21092.5 3 02 1 Smith Deluxe 10 29.5 295.0 4 06 2 Hollingsworth Deluxe 25 29.5 737.5 5 07 3 Hollingsworth Deluxe 60 29.5 1770.0 6 07 3 Hollingsworth Deluxe 30 29.5 885.0 7 07 3 Hollingsworth Deluxe 60 29.5 1770.0 8 08 3 Hollingsworth Deluxe 50 29.5 1475.0 9 08 3 Jones Deluxe 45 29.5 1327.5 10 08 3 Jones Deluxe 225 29.5 6637.5 11 08 3 Smith Deluxe 110 29.5 3245.0 12 08 3 Smith Deluxe 110 29.5 3245.0 13 09 3 Jones Deluxe 275 29.5 8112.5 14 09 3 Jones Deluxe 275 29.5 8112.5 15 11 4 Hollingsworth Deluxe 150 29.5 4425.0 16 12 4 Hollingsworth Deluxe 125 29.5 3687.5 (II) Splus analysis of forbes dataset --- done in R > forbes <- read.table("forbes", skip=3) > dimnames(forbes)[[2]] <- c("boilpt","pressure") > plot(forbes[,1],forbes[,2], xlab="Boiling Pt", ylab="Pressure", main="Scatterplot of Pressure vs Boiling Pt, forbes data") > fitlm <- lm(pressure ~ boilpt, data=forbes) > lines(forbes[,1], fitlm$fit, lty=3) ### Line looks like great fit !! But even though residuals ### are small, they look positive at either end of the axis, ### indicating a slight nonlinearity, EXCEPT for ### one striking outlier near the middle: > identify(forbes[,1],forbes[,2]) [1] 12 > forbes[12,] boilpt pressure 12 204.6 26.57 > fitlm$fit[12] 12 25.92006 ### How can we tell if this residual is really large ? ### Standardize it ! We present the five largest absolute ### standardized residuals and their index values: > hatmat <- solve(t(model.matrix(fitlm)) %*% model.matrix(fitlm)) > hatmat [,1] [,2] [1,] 77.6610575 -0.382365653 [2,] -0.3823657 0.001884011 ### Agrees with summary(fitlm($cov.unscaled)) > stdrsd <- fitlm$resid/(summary(fitlm)$sigma*sqrt(diag( model.matrix(fitlm) %*% hatmat %*% t(model.matrix(fitlm))))) > rbind(rev(order(abs(stdrsd)))[1:5] -> inds, stdrsd[inds]) 12 11 9 14 10 [1,] 12.00000 11.000000 9.000000 14.000000 10.000000 [2,] 11.04000 -4.523838 -3.869706 -3.134178 -3.133269 ### This tells us both that there are too many large residuals ### and that the 12th one is wildly larger than the others. > lines(forbes[,1], lm(pressure ~ boilpt + I(boilpt^2), data=forbes)$fitted, lty=6) ### But looking at either standardized coeff's or anova shows ### that this model does not really fit better. (b) We transform to log(pressure): > fitlm2 <- lm(I(log(pressure)) ~ boilpt, data=forbes) > plot(forbes[,1],forbes[,2], xlab="Boiling Pt", ylab="Pressure", main="Scatterplot of Pressure vs Boiling Pt, forbes data") lines(forbes[,1], exp(fitlm2$fit), lty=3) ### The fit is better, especially to all points except the 12th ! > c(sum(fitlm$resid^2),sum((forbes[,2]-exp(fitlm2$fit))^2), fitlm$resid[12]^2) [1] 0.8131430 0.7820001 0.4224245 (c) > fitlm3 <- lm(I(log(pressure)) ~ boilpt, data=forbes[-12,]) > plot(forbes[-12,1],forbes[-12,2], xlab="Boiling Pt", ylab= "Pressure", main= "Scatterplot of Pressure vs Boiling Pt, forbes data") lines(forbes[-12,1], exp(fitlm2$fit[-12]), lty=3) > sum((forbes[-12,2]-exp(fitlm3$fit))^2) [1] 0.06069679 > plot(forbes[-12,1], fitlm3$resid, xlab="Boiling Point", ylab="Residuals", main= "forbes data Residual Plot without Outlier") abline(h=0) (d) Prediction intervals: at boilpt=200.5, > exp(c(1,200.5) %*% fitlm3$coef + 2.576*c(-1,1)*summary( fitlm3)$sigma * sqrt(1+ c(1,200.5) %*% (summary(fitlm3) $cov.un %*% c(1,200.5)))) [1] 23.45831 23.78805 ### > plot(forbes[-12,1],forbes[-12,2], xlab="Boiling Pt", ylab= "Pressure", main= "Scatterplot of Pressure vs Boiling Pt, forbes data") lines(forbes[-12,1], exp(fitlm2$fit[-12]), lty=3) abline(v=200.5) points(rep(200.5,2), c(23.458, 23.788), pch=18) ### This interval seems very reasonable ! Now do it at 150: > exp(c(1,150) %*% fitlm3$coef + 2.576*c(-1,1)*summary( fitlm3)$sigma * sqrt(1+ c(1,150) %*% (summary(fitlm3) $cov.un %*% c(1,150)))) [1] 8.240272 8.524980 ### Might still be reasonable, ### but probably extrapolated much too far outside the ### range of observations to be trusted !!!