----------------------------------------------------------
CREATION OF DATASET and Case Study              3/5/04
----------------------------------------------------------

We want a model to optimize over in a standard ("logistic 
regression") setting which is MISSPECIFIED.

The model: y_i ~ binom(1, g(a + dmat %*% b))
where  g(x) = pnorm(sgn(x)*abs(x)^1.2).

> dmat <- cbind(rbinom(300,1,0.3), rnorm(300,0,2), rexp(300)/2)
  gfcn <- function(x) sign(x)*(abs(x)^1.2)
  scors <- gfcn( 0.25 + dmat %*% c(0.2, .1, -.34) ) 
  summary(scors)
       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
 -0.4998004 -0.0070889  0.0996329  0.1197654  0.2720349  0.8168634

> ydat <- rbinom(300,1,pnorm(scors))

> outmat <- cbind(round(dmat,6), ydat)
  outmat[1:10,]
                           ydat 
 [1,] 0  2.210028 0.757475    0
 [2,] 0 -1.005519 0.238525    1
 [3,] 1 -3.344933 0.263189    0
 [4,] 0 -0.835836 0.496466    1
 [5,] 1 -0.413936 0.024207    1
 [6,] 1  1.127404 0.898510    1
 [7,] 0  3.612828 0.037393    1
 [8,] 1  0.035434 0.251841    0
 [9,] 1 -1.417317 0.040870    1
[10,] 0 -2.758967 0.618358    1

> write(t(outmat),file="/home1/evs/public_html/s798c/Data/Dset3C.asc",
     ncolumns=4)

-------------------------
Likelihood Maximization
-------------------------

### True beta is (0.25,0.2, 0.1, -0.34)

> beta0 <- c(0.25,0.2, 0.1, -0.34)
  dfram <- data.frame(outmat)
  names(dfram) <- c("x1","x2","x3","y)
> glmtmp <- glm(cbind(y,1-y) ~ . , family=binomial, 
      data=dfram)
> glmtmp
Call:
glm(formula = cbind(y, 1 - y) ~ ., family = binomial, data = dfram)

Coefficients:
 (Intercept)       x1        x2         x3 
   0.3894684 0.427663 0.1409899 -0.6862191

Degrees of Freedom: 300 Total; 296 Residual
Residual Deviance: 398.6474 

> lLikC <- function(beta) { 
              pv <- plogis(beta[1] + data.matrix(dfram[,1:3]) %*%
                  beta[2:4])
              sum(dfram[,4]*log(pv)+ (1-dfram[,4])*log(1-pv)) }
  lLikC(glmtmp$coef) 
[1] -199.3237
> tmpmax <- nlmin(function(b) -lLikC(b), rep(0,4), print.lev=1)
    it    nf      f       reldf      preldf     reldx
     0     1  2.079e2
     1     3  2.048e2    1.490e-2   3.899e-2   1.000e0
     2     4  2.029e2    9.268e-3   1.296e-2   4.633e-1
     3     5  2.013e2    8.278e-3   1.174e-2   3.900e-1
     4     6  2.004e2    4.477e-3   8.827e-3   2.290e-1
     5     7  2.002e2    1.069e-3   8.100e-3   1.990e-1
     6     8  1.998e2    1.705e-3   7.395e-3   1.664e-1
     7    10  1.994e2    2.149e-3   3.573e-3   4.627e-2
     8    11  1.993e2    1.920e-4   5.725e-4   4.521e-2
     9    12  1.993e2    4.783e-5   1.330e-4   2.981e-2
    10    15  1.993e2    2.854e-5   4.605e-5   4.183e-3
    11    16  1.993e2    1.361e-5   1.620e-5   4.337e-3
    12    18  1.993e2    1.157e-5   1.251e-5   9.334e-3
    13    20  1.993e2    1.932e-7   3.051e-7   1.145e-3
    14    21  1.993e2    7.240e-9   7.974e-9   1.147e-4
    15    22  1.993e2    6.259e-11  6.260e-11  1.033e-5
 ***** relative function convergence *****
 function     1.993237e2    reldx        1.0333e-5
 function evaluations:  22   gradient evaluations:  77
 preldf       6.2598e-11  npreldf      6.2598e-11
> tmpmax
$x:
[1]  0.3894684  0.4276630  0.1409899 -0.6862191

$converged:
[1] T

$conv.type:
[1] "relative function convergence"

> pv <-  plogis(tmpmax$x[1] + data.matrix(dfram[,1:3]) %*%
                  tmpmax$x[2:4])
> names(glmtmp)
 [1] "coefficients"      "residuals"         "fitted.values"    
 [4] "effects"           "R"                 "rank"             
 [7] "assign"            "df.residual"       "weights"          
[10] "prior.weights"     "family"            "linear.predictors"
[13] "deviance"          "null.deviance"     "call"             
[16] "iter"              "y"                 "contrasts"        
[19] "terms"             "formula"           "control"         

==============================================================
### Goodness of fit ? What next ? ### RESUME HERE 3/8

> apply(dfram[,1:3],2,summary)
            x1          x2        x3 
   Min. 0.0000 -6.44802500 0.0006170
1st Qu. 0.0000 -1.06085650 0.1530300
 Median 0.0000  0.03665400 0.3683740
   Mean 0.3033  0.07532922 0.5098403
3rd Qu. 1.0000  1.43638525 0.7484550
   Max. 1.0000  5.28844700 2.5357340

> summary(glmtmp)$coef
                 Value Std. Error   t value 
(Intercept)  0.3894684  0.1919609  2.028895
         x1  0.4276630  0.2623609  1.630056
         x2  0.1409899  0.0634357  2.222564
         x3 -0.6862191  0.2534120 -2.707918

> sqrt(diag(summary(glmtmp)$cov.un))
[1] 0.1919609 0.2623609 0.0634357 0.2534120
> round(summary(glmtmp)$cov.un,6)
            (Intercept)        x1        x2        x3 
(Intercept)    0.036849 -0.020813  0.000191 -0.032944
         x1   -0.020813  0.068833  0.000965  0.001569
         x2    0.000191  0.000965  0.004024 -0.001176
         x3   -0.032944  0.001569 -0.001176  0.064218

### This is the inverted "empirical information" matrix
### We verify this using the formula Iobs = 
### sum t*(dfram[i,]) dfram[i,] dlogis(glmtmp$coef %*% dfram[i,])

> dmat <- cbind(rep(1,300),data.matrix(dfram[,1:3]))
  round(solve(t(dmat) %*% (dlogis(c(dmat %*% glmtmp$coef))*dmat)),6)
                    x1        x2        x3 
    0.036850 -0.020813  0.000191 -0.032947
x1 -0.020813  0.068837  0.000965  0.001568
x2  0.000191  0.000965  0.004024 -0.001177
x3 -0.032947  0.001568 -0.001177  0.064223

### Now for some "Goodness of Fit" Checks:

(O) Plot showing comparison between 2 different "link" functions
> xt <- seq(-4,4,.1)
  plot(xt, plogis(xt), type="n", xlab="GLM score",
    ylab="Probability Distribution", main=
    "Plots of Actual vs `Working' D.F. Model")
  lines(xt, plogis(xt), lty=3)
  lines(xt, pnorm(gfcn(xt)))
  printgraph(file="ModlsDsetC.ps")

(O') Also note the comparison between true and 
estimated coefficients :
> rbind(beta0, fitted=glmtmp$coef, s.dev=summary(glmtmp)$coef[,2])
      (Intercept)        x1        x2         x3 
 beta0   0.2500000 0.2000000 0.1000000 -0.3400000
fitted   0.3894684 0.4276630 0.1409899 -0.6862191
 s.dev   0.1919609 0.2623609 0.0634357  0.2534120

NOTE: since we will find throughout that we pass all goodness-
    of-fit checks, it seems that the data are letting us conclude
    essentially that they behave "as if" they were simulated, so we
    should want the logistic-regression estimated coefficients
    estimated to be rather close to the true beta0.

BUT: as a student pointed out, this is not strictly logical 
    unless we show for this pair of "true"and "working" models that
    the theta_star vector (the large-sample limiting value of 
    the logistic-model MLE under the pnorm(gfcn) model is in 
    fact close to beta0. We now do that at the end of this 
    Log-file, under paragraph (IV).

(I) Comparison of 2 ways to calculate Iobs
### The first is the one inverted above:
> Iobs1 <- t(dmat) %*% (dlogis(c(dmat %*% glmtmp$coef))*dmat)
> round(Iobs1,4)
                 x1        x2      x3 
   70.7929  20.5567    2.1942 35.8554
x1 20.5567  20.5567   -2.9857  9.9891
x2  2.1942  -2.9857  250.7962  5.7936
x3 35.8554   9.9891    5.7936 33.8269

> Iobs2 <- t(dmat) %*% ((dfram[,4] - plogis(c(dmat %*% 
      glmtmp$coef)))^2*dmat)
> round(Iobs2,4)
                 x1        x2      x3 
   70.8330  20.5505    5.1217 34.4568
x1 20.5505  20.5505   -3.1552  9.6791
x2  5.1217  -3.1552  254.7686  7.8950
x3 34.4568   9.6791    7.8950 30.8615
#### Remarkably close !!!

(II) Likelihoods.

> sum(dfram[,4]*log(plogis(c(dmat %*% glmtmp$coef))) + 
  (1-dfram[,4])*log(1-plogis(c(dmat %*% glmtmp$coef))))
[1] -199.3237   ### This was the maximized objective 
###  function above.

> lLikC2 <- function(beta) { 
              pv <- pnorm(gfcn(dmat %*% beta))
              sum(dfram[,4]*log(pv)+ (1-dfram[,4])*log(1-pv)) }
### This is the logLik for true model 

[1] -200.7786  ### This is the logLik for true model and
> lLikC2(glmtmp$coef)
[1] -200.7786		### using fitted coefficients

> lLikC2(beta0)
[1] -199.9061         ### logLik for true model, true coeff's

> tmpmax2 <- nlminb(rep(0,4), function(b) -lLikC2(b))> tmpmax2[1:3]
$parameters:
[1]  0.2860430  0.2967455  0.1022324 -0.5276765

$objective:
[1] 198.8974          ### So best logLik = -198.90

$message:
[1] "RELATIVE FUNCTION CONVERGENCE"

(III) Goodness of fit statistics:

(A) 
> sum( (dfram[,4] - plogis(c(dmat %*% glmtmp$coef)))^2 )/
  sum( dlogis(c(dmat %*% glmtmp$coef)) ) 
[1] 1.000568		 ### Ratio of variances SHOULD be 1 !

> sum( (dfram[,4] - plogis(c(dmat %*% glmtmp$coef)))^2 /
     dlogis(c(dmat %*% glmtmp$coef)) ) 
[1] 299.6247		     ### Fine for chi-squared 300 df!

> mtabl <- table(dfram[,1], dfram[,2] > 0, dfram[,3] > .4, dfram[,4])
  mtabl
		### These are the observed counts
, , FALSE, 0
  FALSE TRUE 
0    26   23
1    12    9

, , TRUE, 0
  FALSE TRUE 
0    23   30
1    11    3

, , FALSE, 1
  FALSE TRUE 
0    30   29
1    18   15

, , TRUE, 1
  FALSE TRUE 
0    21   27
1     8   15

> etabl <- aggregate.data.frame(plogis(c(dmat %*% 
    glmtmp$coef)) , by=
    list(dfram[,1], dfram[,2] > 0, dfram[,3] > .4), sum)
> etabl
  Group.1 Group.2 Group.3         x 
1       0   FALSE   FALSE 28.604299
2       1   FALSE   FALSE 18.665996
3       0    TRUE   FALSE 32.361965
4       1    TRUE   FALSE 17.091907
5       0   FALSE    TRUE 17.663440
6       1   FALSE    TRUE  8.996245
7       0    TRUE    TRUE 28.370296
8       1    TRUE    TRUE 11.245852

> sum((etabl[,4]-mtabl[,,,2])^2/etabl[,4])
[1] 2.75716  ### chi-square gof for 8-4 df 

### Should actually have calculated:
> sum((etabl[,4]-mtabl[,,,2])^2/(etabl[,4]*(1-etabl[,4]/
      (mtabl[,,,1]+mtabl[,,,2]))))
[1] 6.750597		#### chi-square on 8-4 df

(IV) Calculation of theta_star:

recall from class (3/10/04) that theta_star is characterized as 
the maximizer of the integral under true density of the log of 
the working density: we define the Kullback Leibler functional
of the two densities as a function of beta, and maximize it.  

NB. At this point dmat was discarded, so we simulate a new matrix of
this type and re-calculate, which is OK since we are only estimaating
analytical integrals !!
>  dmat0 <- cbind(rep(1,3000), rbinom(3000,1,0.3), rnorm(3000,0,2),
       rexp(3000)/2)
>  pvtrue <- plogis(dmat0 %*% beta0)
   KLfunc <- function(beta) {
      pvwork <- pnorm(gfcn(dmat0 %*% beta))
      -mean(pvtrue*log(pvwork) + (1-pvtrue)*log(1-pvwork)) }
   nlmin(KLfunc,beta0)
$x:
[1]  0.19829751  0.14870286  0.07590155 -0.26249871

$converged:
[1] T

$conv.type:
[1] "relative function convergence"

> rbind(.Last.value$x, beta0, fitted=glmtmp$coef, 
           s.dev=summary(glmtmp)$coef[,2])
      (Intercept)        x1        x2         x3 
         0.1982975 0.1487029 0.0759016 -0.2624987  ## theta_star
 beta0   0.2500000 0.2000000 0.1000000 -0.3400000
fitted   0.3894684 0.4276630 0.1409899 -0.6862191
 s.dev   0.1919609 0.2623609 0.0634357  0.2534120