Log for Solution of  HW 5,  Stat 798c            4/18/03
=======================================

(I) The wording of the problem is somewhat unclear (sorry!).
(1) You are asked first for the average amount sold by each 
sales-rep, by month: in retrospect, what this should mean is 
that the data are sorted over sales-rep and month and that the 
average of the monthly total sales amounts for each rep are 
printed out (a vector or `table' of three numbers).
(2) Next you are asked for the average amount sold, by quarter.
In this case, the best interpretation is that the data are sorted by 
month, quarterly totals are calculated and the average of the 4 
quarterly amounts printed out (1 number).
(3) Totals are easier to interpret: here the data are to be sorted 
by Price & Type, and totals within each sorted category printed
out, omitting month and quarter and sales-rep.
(4) Here we print the subset of Deluxe-type data, sorted: no 
interpretation needed for this one.

First comes the Splus code (done in R, starting from ASCII file 
containing the data with 1 header-line), then the SAS, then the 
SAS edited printouts.

> salesdat <- read.table("SASdata.txt", header=T)
> names(salesdat)
[1] "Mth"     "Qtr"     "Rep"     "Type"    "Units"   "Price"   "amtsold"
[1] "Mth"   "Qtr"   "Rep"   "Type"  "Units" "Price"
> salesdat <- cbind.data.frame(salesdat, amtsold=salesdat[,5]*salesdat[,6])
> cat("Average Monthly Sales by Rep \n")
Average Monthly Sales by Rep 
> aggregate.data.frame(salesdat[order(salesdat[,3]),7],
     by=list(Rep=salesdat[order(salesdat[,3]),3]),mean)
            Rep        x
1 Hollingsworth 6916.359
2         Jones 7766.079
3         Smith 8190.158
> cat("Average of Total Quarterly Sales Results \n", 
  sum(salesdat[,7]), "\n")
Average of Total Quarterly Sales Results 
 844040.8 
> cat("Total Units Sold, by Price and Type \n")
> tmpord <- order(interaction(factor(salesdat[,6]),salesdat[,3]))
> aggregate.data.frame(data.frame(Units=salesdat[tmpord,5]), by=list(
     Price=salesdat[tmpord,6], type=salesdat[tmpord,4]),sum)
  Price     type Units
1  29.5   Deluxe  2525
2 19.95 Standard 38234
3  29.5 Standard   230
> salesdat2 <- salesdat[salesdat[,"Type"]=="Deluxe",c(1,3,5:7)]
> tmpord2 <- order(interaction(salesdat2[,3], factor(salesdat2[,1]))
> cat("Deluxe Sales by Month and Rep \n")
> salesdat2[tmpord2,]
    Mth           Rep Units Price amtsold
1     1 Hollingsworth   260  29.5  7670.0
5     1         Smith   715  29.5 21092.5
8     2         Smith    10  29.5   295.0
32    6 Hollingsworth    25  29.5   737.5
40    7 Hollingsworth    30  29.5   885.0
38    7 Hollingsworth    60  29.5  1770.0
48    7 Hollingsworth    60  29.5  1770.0
67    8         Jones    45  29.5  1327.5
72    8 Hollingsworth    50  29.5  1475.0
57    8         Smith   110  29.5  3245.0
69    8         Smith   110  29.5  3245.0
75    8         Jones   225  29.5  6637.5
82    9         Jones   275  29.5  8112.5
91    9         Jones   275  29.5  8112.5
105  11 Hollingsworth   150  29.5  4425.0
108  12 Hollingsworth   125  29.5  3687.5

### Now begin SAS code: easiest approach is to use PROC TABULATE

[ assume data-step given in exercise has first been used to create
working SAS dataset  "allyear" and column amtsold=units*price. ]

NOTE: The data set WORK.ALLYEAR has 110 observations and 7 variables.

proc sort out=allyear2;
  by salesrep mth;
data allyear2 (drop= amtsold) ;
  set allyear2 (drop=qtr units price type);
  by salesrep mth;
  if first.mth then Totsold = 0;
  Totsold + amtsold;
  if last.mth then output;
proc tabulate data=allyear2;
 title 'Average amt sold per salesrep by month';
  class salesrep mth;
  var Totsold ;
  table salesrep, Totsold*mean;
run;
proc sort data=allyear out=allyear3;
   by qtr;
data allyear3;
  set allyear3 (keep=qtr amtsold);
   by qtr;
   if first.qtr then Totsold=0;
   Totsold + amtsold;
   if last.qtr then output;
proc tabulate data=allyear3;
  title 'Average quarterly sales';
  var Totsold;
  table Totsold*mean;
run;


proc sort data=allyear out=allyear4;
  by price type;
data allyear4 (drop=mth qtr salesrep );
  set allyear;
  by price type;
  if first.price or first.type then totprtyp=0; 
   /* wouldn't it work with just first.type ? */
  totprtyp+units;
  if last.price or last.type; 
/* wouldn't it work with just last.type ? */
proc print;
  title 'Totals by Price & Type'; run;

            Totals by Price & Type      

  Obs      type      units    price    amtsold    totprtyp

  1     Standard     1000    19.95    19950.0   38234
  2     Deluxe        125    29.50     3687.5       2525
  3     Standard      230    29.50     6785.0        230

proc sort data=allyear out=allyear5;
  by mth salesrep;
data allyear5;
  set allyear5;
  if type="Deluxe";
proc print;
  title 'Deluxe Units Sold, by Mth & Salesrep';
  run;
        Deluxe Units Sold, by Mth & Salesrep

Obs  mth   qtr    salesrep       type     units  price  amtsold

1    01     1     Hollingsworth    Deluxe   260  29.5  7670.0
2    01     1     Smith            Deluxe   715  29.5 21092.5
3    02     1     Smith            Deluxe    10  29.5   295.0
4    06     2     Hollingsworth    Deluxe    25  29.5   737.5 
5    07     3     Hollingsworth    Deluxe    60  29.5  1770.0
6    07     3     Hollingsworth    Deluxe    30  29.5   885.0
7    07     3     Hollingsworth    Deluxe    60  29.5  1770.0
8    08     3     Hollingsworth    Deluxe    50  29.5  1475.0
9    08     3     Jones            Deluxe    45  29.5  1327.5
10   08     3     Jones            Deluxe   225  29.5  6637.5
11   08     3     Smith            Deluxe   110  29.5  3245.0
12   08     3     Smith            Deluxe   110  29.5  3245.0
13   09     3     Jones            Deluxe   275  29.5  8112.5
14   09     3     Jones            Deluxe   275  29.5  8112.5
15   11     4     Hollingsworth    Deluxe   150  29.5  4425.0
16   12     4     Hollingsworth    Deluxe   125  29.5  3687.5


(II) Splus analysis of forbes dataset --- done in R
> forbes <- read.table("forbes", skip=3)
> dimnames(forbes)[[2]] <- c("boilpt","pressure")
> plot(forbes[,1],forbes[,2], xlab="Boiling Pt", ylab="Pressure",
    main="Scatterplot of Pressure vs Boiling Pt, forbes data")
> fitlm <- lm(pressure ~ boilpt, data=forbes)
> lines(forbes[,1], fitlm$fit, lty=3)
	### Line looks like great fit !! But even though residuals
	### are small, they look positive at either end of the axis,
	### indicating a slight nonlinearity, EXCEPT for 
	### one striking outlier near the middle:
> identify(forbes[,1],forbes[,2])
[1] 12
> forbes[12,]
   boilpt pressure
12  204.6    26.57
> fitlm$fit[12]
      12 
25.92006   ### How can we tell if this residual is really large ?
	### Standardize it ! We present the five largest absolute
     ### standardized residuals and their index values:
> hatmat <- solve(t(model.matrix(fitlm)) %*% model.matrix(fitlm))
> hatmat
           [,1]         [,2]
[1,] 77.6610575 -0.382365653
[2,] -0.3823657  0.001884011 
### Agrees with summary(fitlm($cov.unscaled))
> stdrsd <- fitlm$resid/(summary(fitlm)$sigma*sqrt(diag(
     model.matrix(fitlm) %*% hatmat %*% t(model.matrix(fitlm)))))
> rbind(rev(order(abs(stdrsd)))[1:5] -> inds, stdrsd[inds])
           12        11         9        14        10
[1,] 12.00000 11.000000  9.000000 14.000000 10.000000
[2,] 11.04000 -4.523838 -3.869706 -3.134178 -3.133269
### This tells us both that there are too many large residuals 
###   and that the 12th one is wildly larger than the others.
> lines(forbes[,1], lm(pressure ~ boilpt + I(boilpt^2), 
     data=forbes)$fitted, lty=6)
### But looking at either standardized coeff's or anova shows
###   that this model does not really fit better.

(b) We transform to log(pressure):
> fitlm2 <- lm(I(log(pressure)) ~ boilpt, data=forbes)
> plot(forbes[,1],forbes[,2], xlab="Boiling Pt", ylab="Pressure",
    main="Scatterplot of Pressure vs Boiling Pt, forbes data")
  lines(forbes[,1], exp(fitlm2$fit), lty=3)
### The fit is better, especially to all points except the 12th ! 
> c(sum(fitlm$resid^2),sum((forbes[,2]-exp(fitlm2$fit))^2), 
	fitlm$resid[12]^2)
[1] 0.8131430 0.7820001 0.4224245

(c) > fitlm3 <- lm(I(log(pressure)) ~ boilpt, data=forbes[-12,])
> plot(forbes[-12,1],forbes[-12,2], xlab="Boiling Pt", ylab=
   "Pressure",    main=
   "Scatterplot of Pressure vs Boiling Pt, forbes data")
  lines(forbes[-12,1], exp(fitlm2$fit[-12]), lty=3)
> sum((forbes[-12,2]-exp(fitlm3$fit))^2)
[1] 0.06069679
> plot(forbes[-12,1], fitlm3$resid, 
     xlab="Boiling Point", ylab="Residuals", main=
     "forbes data Residual Plot without Outlier")
  abline(h=0)

(d) Prediction intervals: at boilpt=200.5,
> exp(c(1,200.5) %*% fitlm3$coef + 2.576*c(-1,1)*summary(
     fitlm3)$sigma * sqrt(1+ c(1,200.5) %*% (summary(fitlm3)
     $cov.un %*% c(1,200.5))))
[1] 23.45831 23.78805 ###
> plot(forbes[-12,1],forbes[-12,2], xlab="Boiling Pt", ylab=
    "Pressure",    main=
    "Scatterplot of Pressure vs Boiling Pt, forbes data")
   lines(forbes[-12,1], exp(fitlm2$fit[-12]), lty=3)
  abline(v=200.5)
  points(rep(200.5,2), c(23.458, 23.788), pch=18)
### This interval seems very reasonable ! Now do it at 150:
> exp(c(1,150) %*% fitlm3$coef + 2.576*c(-1,1)*summary(
     fitlm3)$sigma * sqrt(1+ c(1,150) %*% (summary(fitlm3)
     $cov.un %*% c(1,150))))
[1]  8.240272 8.524980 ### Might still be reasonable, 
###  but probably extrapolated much too far outside the 
###  range of observations to be trusted !!!