Factors and Contrasts, and then Bayesian Example
================================================ 11/6/17

Obtained data "lirat" from package VGAM, consisting of 58 4-tuples concerning 
litters of rats in an experiment to link low-iron to malformation and death.

> library(VGAM)

> help(lirat)
N
Litter size.

R
Number of dead fetuses.

hb
Hemoglobin level.

grp
Group number. Group 1 is the untreated (low-iron) group, 
group 2 received injections on day 7 or day 10 only, 
group 3 received injections on days 0 and 7, and 
group 4 received injections weekly.

> table(lirat$grp)
 1  2  3  4 
31 12  5 10 

>  sapply(lirat, class)
        N         R        hb       grp 
"integer" "integer" "numeric" "integer" 

> contrasts(factor(lirat$grp))
  2 3 4
1 0 0 0
2 1 0 0
3 0 1 0
4 0 0 1

> sapply(esoph,class)
$agegp
[1] "ordered" "factor" 

$alcgp
[1] "ordered" "factor" 

$tobgp
[1] "ordered" "factor" 

$ncases
[1] "numeric"

$ncontrols
[1] "numeric"

> contrasts(esoph$agegp)
             .L         .Q         .C         ^4          ^5
[1,] -0.5976143  0.5455447 -0.3726780  0.1889822 -0.06299408
[2,] -0.3585686 -0.1091089  0.5217492 -0.5669467  0.31497039
[3,] -0.1195229 -0.4364358  0.2981424  0.3779645 -0.62994079
[4,]  0.1195229 -0.4364358 -0.2981424  0.3779645  0.62994079
[5,]  0.3585686 -0.1091089 -0.5217492 -0.5669467 -0.31497039
[6,]  0.5976143  0.5455447  0.3726780  0.1889822  0.06299408

>    t(contrasts(tmp2)) %*% contrasts(tmp2)
             .L            .Q            .C            ^4            ^5
.L 1.000000e+00  0.000000e+00  2.775558e-17  9.714451e-17  4.857226e-17
.Q 0.000000e+00  1.000000e+00 -1.110223e-16  1.249001e-16 -8.326673e-17
.C 2.775558e-17 -1.110223e-16  1.000000e+00 -1.526557e-16  3.469447e-18
^4 9.714451e-17  1.249001e-16 -1.526557e-16  1.000000e+00  1.040834e-16
^5 4.857226e-17 -8.326673e-17  3.469447e-18  1.040834e-16  1.000000e+00


> tmp = factor(sample(1:6,20,replace=T))
    tmp2 = C(tmp, poly)
    contrasts(tmp2)
             .L         .Q         .C         ^4          ^5
[1,] -0.5976143  0.5455447 -0.3726780  0.1889822 -0.06299408
[2,] -0.3585686 -0.1091089  0.5217492 -0.5669467  0.31497039
[3,] -0.1195229 -0.4364358  0.2981424  0.3779645 -0.62994079
[4,]  0.1195229 -0.4364358 -0.2981424  0.3779645  0.62994079
[5,]  0.3585686 -0.1091089 -0.5217492 -0.5669467 -0.31497039
[6,]  0.5976143  0.5455447  0.3726780  0.1889822  0.06299408
   #### This type of "polynomial" contrasts is the default for an ordered factor of 6 levels
  ### and determines how the model matrix is filled.

>  tmp3 = C(tmp, helmert)
   contrasts(tmp3)
  [,1] [,2] [,3] [,4] [,5]
1   -1   -1   -1   -1   -1
2    1   -1   -1   -1   -1
3    0    2   -1   -1   -1
4    0    0    3   -1   -1
5    0    0    0    4   -1
6    0    0    0    0    5

> x = factor(sample(6, 100, replace=T))
  y = rnorm(100, 1+2*as.numeric(x), 1)
  x2 = C(x,helmert)
  dfr = cbind.data.frame(x=x2, y=y)
  tmpfit = lm(y ~ x, data=dfr)
  summary(tmpfit)
Call:
lm(formula = y ~ C(factor(x), helmert))

Residuals:
     Min       1Q   Median       3Q      Max 
-2.23200 -0.65663 -0.02958  0.49959  2.12823 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  7.90023    0.09771  80.855  < 2e-16 ***
x1           0.97695    0.16470   5.932 4.95e-08 ***
x2           0.69519    0.11380   6.109 2.25e-08 ***
x3           0.93159    0.06977  13.352  < 2e-16 ***
x4           1.02174    0.05233  19.524  < 2e-16 ***
x5           1.04814    0.03709  28.261  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9457 on 94 degrees of freedom
Multiple R-squared:  0.9399,    Adjusted R-squared:  0.9367 
F-statistic: 294.1 on 5 and 94 DF,  p-value: < 2.2e-16

> summary(lm(y~x))
Call:
lm(formula = y ~ x)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.23200 -0.65663 -0.02958  0.49959  2.12823 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   3.2266     0.2294  14.068  < 2e-16 ***
x2            1.9539     0.3294   5.932 4.95e-08 ***
x3            3.0625     0.3769   8.126 1.74e-12 ***
x4            5.3985     0.3294  16.389  < 2e-16 ***
x5            7.7124     0.3244  23.777  < 2e-16 ***
x6            9.9143     0.2998  33.072  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9457 on 94 degrees of freedom
Multiple R-squared:  0.9399,    Adjusted R-squared:  0.9367 
F-statistic: 294.1 on 5 and 94 DF,  p-value: < 2.2e-16

> cbind(xval=x[1:10], model.matrix(tmpfit)[1:10,])
   xval (Intercept) x1 x2 x3 x4 x5
1     1           1 -1 -1 -1 -1 -1
2     1           1 -1 -1 -1 -1 -1
3     3           1  0  2 -1 -1 -1
4     2           1  1 -1 -1 -1 -1
5     5           1  0  0  0  4 -1
6     2           1  1 -1 -1 -1 -1
7     6           1  0  0  0  0  5
8     3           1  0  2 -1 -1 -1
9     5           1  0  0  0  4 -1
10    6           1  0  0  0  0  5

## >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 
## RETURN TO  lirat  EXAMPLE
## >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

> aggregate(lirat$R/lirat$N, by=list(lirat$grp), mean)
  Group.1          
1       1 0.77632600
2       2 0.10287826
3       3 0.03246753
4       4 0.04112979

> lirat2 = lirat; lirat2$grp = factor(5-lirat$grp)
> contrasts(lirat2$grp)
  2 3 4
1 0 0 0
2 1 0 0
3 0 1 0
4 0 0 1
> aggregate(lirat2$R/lirat2$N, by=list(lirat2$grp), mean)
  Group.1          x
1       1 0.04112979
2       2 0.03246753
3       3 0.10287826
4       4 0.77632600

> fitR = glm(cbind(R, N-R) ~ hb + grp, family=binomial, data=lirat2)
  anova(fitR)
Analysis of Deviance Table
Model: binomial, link: logit

Response: cbind(R, N - R)
Terms added sequentially (first to last)

     Df Deviance Resid. Df Resid. Dev
NULL                    57     509.43
hb    1  303.706        56     205.73
grp   3   36.819        53     168.91

### Now let's try something stepwise using the model matrix of this fit
> model.matrix(fitR)[1:10,]
   (Intercept)  hb grp2 grp3 grp4
1            1 4.1    0    0    1
2            1 3.2    0    0    1
3            1 4.7    0    0    1
4            1 3.5    0    0    1
5            1 3.2    0    0    1
6            1 5.9    0    0    1
7            1 4.7    0    0    1
8            1 4.7    0    0    1
9            1 3.5    0    0    1
10           1 4.8    0    0    1

> lirat3 = cbind.data.frame(lirat[,1:3], model.matrix(fitR)[,3:5])
> names(lirat3)
[1] "N"    "R"    "hb"   "grp2" "grp3" "grp4"

> fitR2 = step(glm(cbind(R,N-R) ~ hb, family=binomial, data=lirat3), 
                 scope=list(upper=cbind(R,N-R) ~ hb+grp2+grp3+grp4), 
                 direction="forward", k=4)

Start:  AIC=285.2
cbind(R, N - R) ~ hb

       Df Deviance    AIC
+ grp4  1   170.57 254.04
+ grp3  1   184.79 268.26
<none>      205.73 285.20
+ grp2  1   202.88 286.35

Step:  AIC=254.04
cbind(R, N - R) ~ hb + grp4

       Df Deviance    AIC
<none>      170.57 254.04
+ grp2  1   169.19 256.66
+ grp3  1   170.44 257.91

> fitR2
Call:  glm(formula = cbind(R, N - R) ~ hb + grp4, family = binomial, 
    data = lirat3)

Coefficients:
(Intercept)           hb         grp4  
    -0.6239      -0.1871       2.6509  

Degrees of Freedom: 57 Total (i.e. Null);  55 Residual
Null Deviance:      509.4 
Residual Deviance: 170.6        AIC: 248

> c(logLik(fitR), logLik(fitR2))
[1] -120.1888 -121.0219