DATA DESCRIPTION:

Log to Examine Alternative Models and Fits       
for Kyphosis Data is LogLinear vs GLM Setting
===================================== 5/10/03

> kyphA <- read.table("kyphosis", skip=8,  header=T)
> dim(kyphA)
[1] 81  5		## This is correct: now recode 
		###  to get categorical variables
> names(kyphA)
[1] "Id"       "Kyphosis" "Age"      "Number"   "Start"   

> Ncat <- ifelse(kyphA$Number < 4, "A", ifelse(kyphA
	$Number > 5, "C","B"))
> table(Ncat)
Ncat
 A  B  C 
35 35 11 

> Scat <- ifelse(kyphA$Start < 8, "a", ifelse(kyphA
    $Start > 13, "c", "b"))
> table(Scat)
Scat
 a  b  c 
17 30 34 
> Acat <- ifelse(kyphA$Age <96, ifelse(kyphA$Age < 31,1,2), 
    ifelse(kyphA$Age < 137,3,4))
> table(Acat)
Acat
 1  2  3  4 
22 21 21 17 

> kyphB <- cbind.data.frame(kyph=factor(kyphA$Kyphosis),
 Acat=factor(Acat), Ncat=factor(Ncat), Scat=factor(Scat))
### Now all four categories are factors.
> levels(kyphB$kyph)
[1] "absent"  "present"  ### So "absent" is coded 1, and
	### response indicator in glm is "present"

####   We have two data-structures, the original with 
### ordered (numerical) predictors and the second with 
### ordered categories. We compare the analysis of the 
### latter, in R, by glm and loglin.

    ### We begin by doing exploratory analysis with the
### categorically re-coded variables.

> glmB <- glm(kyph ~ ., family=binomial, data=kyphB)
> anova(glmB)
Analysis of Deviance Table, Model: binomial, link: logit
Response: kyph, Terms added sequentially 

     Df Deviance Resid. Df Resid. Dev
NULL                    80     83.234
Acat  3    7.394        77     75.841
Ncat  2    9.458        75     66.383
Scat  2   12.269        73     54.114

## This makes the age-category least useful:
> anova(glm(kyph~ Scat + Ncat + Acat, family=binomial,
    data=kyphB))
> contrasts(kyphB$Acat)
  2 3 4
1 0 0 0
2 1 0 0
3 0 1 0
4 0 0 1	### Corresponds to "treatment" contrasts

> summary(glm(kyph~ Scat + Ncat + Acat, family=binomial,
    data=kyphB))$coef
            Estimate Std. Error z value Pr(>|z|)
(Intercept)  -2.5668     1.3515 -1.8992   0.0575
Acat2         2.4586     1.2558  1.9578   0.0503
Acat3         2.9454     1.2854  2.2915   0.0219
Acat4         2.2259     1.4075  1.5815   0.1138
NcatB         0.2269     0.8218  0.2762   0.7824
NcatC         1.7968     1.0479  1.7146   0.0864
Scatb        -1.2477     0.7537 -1.6555   0.0978
Scatc        -3.4125     1.1958 -2.8538   0.0043
### Thus there seems to be a monotone tendency in 
###    Age  (later categories have larger coefficients)
###    Ncat (larger Number implies larger coeff)
###    Scat (larger Start implies more neg coeff)

### With the exception of Scatc, all individual dummy-
###   coefficients are only borderline-significant 
###   at best. But glm using the numeric predictors 
###   does better.
> summary(glmB)$aic
[1] 70.11384  ### based on 8 fitted parameters, so 
### logLik = -0.5*(70.114 + 2*8) =  -27.057
### which agrees with 
> sum(kyphB$y*log(kyphB$fit) + (1-kyphB$y)*
         log(1-kyphB$fit))

> glmA <- glm(Kyphosis ~ Start + Number + Age, family=
   binomial, data=kyphA) ## AIC = 69.38, logLik= -31.69
> anova(glmA)
       Df Deviance Resid. Df Resid. Dev
NULL                      80     83.234
Start   1   15.162        79     68.072
Number  1    3.536        78     64.536
Age     1    3.157        77     61.380

### But in this glm it makes more sense to try 
###      interactions !
> glmA2 <-  glm(Kyphosis ~ Start*Number + Age, family=
   binomial, data=kyphA) 
#### Neither this nor the other 2nd order interactions seem
###   worthwhile ! But quadratic terms in Start and Age DO:
> glmA2 <-  glm(Kyphosis ~ Start+ I(Start^2) + Number + 
    Age + I(Age^2), family= binomial, data=kyphA) 
### AIC = 61.45497,  logLik = -25.7275
    #### This one seems to be the best model overall !!!
> anova(glmA2)
           Df Deviance Resid. Df Resid. Dev
NULL                          80     83.234
Start       1   15.162        79     68.072
I(Start^2)  1    6.373        78     61.699
Number      1    1.684        77     60.015
Age         1    3.072        76     56.942
I(Age^2)    1    7.487        75     49.455

### Residuals Pictures:
> plot(kyphA$Age, glmB$resid)
> identify(kyphA$Age, glmB$resid)
[1] 11 43 46 77 ### obs 11 is very large pos resid, obs 43 has 
### moderately large negative one, others moderate positiv.

> plot(kyphA$Age, glmA$resid)
> identify(kyphA$Age, glmA$resid)
[1] 10 11 23 40 43 46 77	### But 43 is now extreme negative
  ### residual, with 11 and 23 rather extreme positive ones !
> plot(kyphA$Age, glmA2$resid)

### Now the only very large residuals are observations 77(pos) 
and 43(neg), with 11 and 25 moderately large positive.
### plotting residuals against Number now shows only 77 and 43
###   very noticeable

> cbind.data.frame(obs=kyphA$Kyphosis, Afit=glmA$fit, 
   Bfit=glmB$fit, Age=kyphA$Age, Acat=kyphB$Acat, Start=
   kyphA$Start, No=kyphA$Number)[c(11,43,46,47, 25, 77),]
       obs      Afit       Bfit Age Acat Start No
11 present 0.1214575 0.03578569  82    2    14  5
43  absent 0.9309747 0.81090750 143    4     3  9
46 present 0.2057312 0.16958604 139    4    10  3
47  absent 0.1186181 0.05695073 136    3    15  4
25 present 0.6429469 0.31647929  15    1     2  7
77 present 0.1450918 0.16958604 157    4    13  3

### All but 11, 25, 43, 77 are very well fitted by glmA2.

### Taking further interaction-terms leads to surrogates 
###    for dummy-indicators for these individual 
###    observations: not advisable to do this !!!
=======================================================
OK, if this was how well we could do with GLM, how about
log-linear models ?? Here the data structure kyphB is the
only usable one, and the logLik values have completely 
different meanings !!

## So as preliminary: plot the predictors against one another
##    to check relationships !!
> plot(kyphA$Number, kyphA$Start)  ### etc.
> attach(kyphB)
  table(Scat, Ncat, Acat)
                  , , Acat = 1
    Ncat
Scat A B C
   a 1 2 1
   b 2 5 2
   c 7 2 0
		  , , Acat = 2
    Ncat
Scat A B C
   a 1 4 1
   b 1 3 2
   c 4 4 1
		  , , Acat = 3
    Ncat
Scat A B C
   a 2 2 1
   b 4 4 1
   c 5 2 0
		  , , Acat = 4
    Ncat
Scat A B C
   a 0 0 2
   b 2 4 0
   c 6 3 0

### Now begin with loglin model on these 
###    categories (in this order) plus kyph, 
###    allowing interactions up to second order 
###    with kyph and 3-way among the others

> lglinB <- loglin(table(Scat, Ncat, Acat, kyph),
  margin=list(1:3, c(1,4), c(2,4), c(3,4)), 
  param=T, fit=T)
5 iterations: deviation 0.06745102
> lglinB$lrt ### =  22.93809
> loglin(table(Scat, Ncat, Acat, kyph),
  margin=list(1:3, c(1,4), c(2,4)))$lrt ## 31.247
3 iterations: deviation 0.05945936 
> loglin(table(Scat, Ncat, Acat, kyph),
  margin=list(1:3, c(1,4), c(3,4)))$lrt ## 26.709
4 iterations: deviation 0.02596825 
> loglin(table(Scat, Ncat, Acat, kyph),
  margin=list(1:3, c(2,4), c(3,4)))$lrt ## 35.206
4 iterations: deviation 0.06688744 

### Consider the lrt calculation(in lglinB):
> Ctab <- table(Scat, Ncat, Acat, kyph)
  2*sum(ifelse(Ctab>0,Ctab*log(Ctab/lglinB$fit),0))
[1] 22.938
### This says the lrt number is exactly the deviance 
###     statistic. A reasonable `NULL' model is one
###     which is saturated wrt the first 3 responses
###     and indep wrt the 4th.
> loglin(table(Scat, Ncat, Acat, kyph),
  margin=list(1:3, c(1,4)))$lrt ## 33.949
2 iterations: deviation 8.881784e-16 
> loglin(table(Scat, Ncat, Acat, kyph),
+   margin=list(1:3, 4))$lrt ## 52.056
2 iterations: deviation 7.105427e-15 

### The loglik for the saturated model is:
> sum(ifelse(Ctab>0,Ctab*log(Ctab/81),0))### -277.347

## So we have the following hierarchy, with variables:
###  X=Scat, Z=Ncat, W=Acat, Y=kyph

###      Model            Deviance      logLik
###    Saturated            0		-277.347
###   XZW, XY, ZY, WY      22.938       -288.816
###   XZW, XY, WY          26.709       -290.702
###   XZW, XY, ZY          31.247       -292.976
###   XZW, WY, ZY          35.206       -294.956
###   XZW, XY              33.949       -293.627
###   XZW, Y               52.056       -303.881

## The decrement to deviance due to the Scat variable
##    is:  52.056 - 33.949 = 18.107
### Agrees exactly with:
> anova(glm(kyph ~ Scat, family=binomial)) 

----------------------------------------------------
SAS Code follows:
----------------------------------------------------

data kyphB (drop= Age Number Start Id);
  infile "kyphosis";
  input id kyph $ Age Number Start;
  if Age < 96 then if Age < 31 then Acat=1; 
     else Acat=2;
  else if Age <137 then Acat=3; else Acat=4;
  if Number < 4 then Ncat="A";
     else if Number > 5 then Ncat="C"; else Ncat="B";
  if Start < 8 then Scat="a"; 
     else if Start > 13 then Scat="c"; else Scat="b";
  if id ne . ;
run;
proc sort;
  by kyph Acat Ncat Scat;
  data kyphC;
     set kyphB;
     by kyph Acat Ncat Scat;
     if first.Scat then count=0;
     count+1;
     if last.Scat then output;

    proc genmod data=kyphB desc;
        class kyph Acat Ncat Scat ;
        model kyph = Scat Ncat Acat / dist=bin
            link=logit type3;
/* This is the same model as glmB in Splus above */

proc print;
 run;

Obs     kyph      Acat    Ncat    Scat    count
  1    absent       1      A       a        1
  2    absent       1      A       b        2
  3    absent       1      A       c        7
  4    absent       1      B       a        2
  5    absent       1      B       b        5
  6    absent       1      B       c        2
  7    absent       1      C       b        2
  8    absent       2      A       a        1
  9    absent       2      A       b        1
 10    absent       2      A       c        4
 11    absent       2      B       a        2
 12    absent       2      B       b        2
 13    absent       2      B       c        3
 14    absent       2      C       b        1
 15    absent       2      C       c        1
 16    absent       3      A       a        1
 17    absent       3      A       b        3
 18    absent       3      A       c        5
 19    absent       3      B       b        3
 20    absent       3      B       c        2
 21    absent       4      A       c        6
 22    absent       4      B       b        4
 23    absent       4      B       c        3
 24    absent       4      C       a        1
 25    present      1      C       a        1
 26    present      2      B       a        2
 27    present      2      B       b        1
 28    present      2      B       c        1
 29    present      2      C       a        1
 30    present      2      C       b        1
 31    present      3      A       a        1
 32    present      3      A       b        1
 33    present      3      B       a        2
 34    present      3      B       b        1
 35    present      3      C       a        1
 36    present      3      C       b        1
 37    present      4      A       b        2
 38    present      4      C       a        1

proc catmod data=kyphC;
  weight count;
  model kyph*Acat*Ncat*Scat = _response_ / 
     ml nodesign noresponse noiter noprofile; 
  loglin Acat|Ncat|Scat kyph kyph*Acat kyph*Ncat kyph*Scat ;
run;
/* The output gives ml estimates and SE for each of the 
    cross-classified coefficients and is nearly useless! */