---------------------------------------------------------- CREATION OF DATASET and Case Study 3/5/04 ---------------------------------------------------------- We want a model to optimize over in a standard ("logistic regression") setting which is MISSPECIFIED. The model: y_i ~ binom(1, g(a + dmat %*% b)) where g(x) = pnorm(sgn(x)*abs(x)^1.2). > dmat <- cbind(rbinom(300,1,0.3), rnorm(300,0,2), rexp(300)/2) gfcn <- function(x) sign(x)*(abs(x)^1.2) scors <- gfcn( 0.25 + dmat %*% c(0.2, .1, -.34) ) summary(scors) Min. 1st Qu. Median Mean 3rd Qu. Max. -0.4998004 -0.0070889 0.0996329 0.1197654 0.2720349 0.8168634 > ydat <- rbinom(300,1,pnorm(scors)) > outmat <- cbind(round(dmat,6), ydat) outmat[1:10,] ydat [1,] 0 2.210028 0.757475 0 [2,] 0 -1.005519 0.238525 1 [3,] 1 -3.344933 0.263189 0 [4,] 0 -0.835836 0.496466 1 [5,] 1 -0.413936 0.024207 1 [6,] 1 1.127404 0.898510 1 [7,] 0 3.612828 0.037393 1 [8,] 1 0.035434 0.251841 0 [9,] 1 -1.417317 0.040870 1 [10,] 0 -2.758967 0.618358 1 > write(t(outmat),file="/home1/evs/public_html/s798c/Data/Dset3C.asc", ncolumns=4) ------------------------- Likelihood Maximization ------------------------- ### True beta is (0.25,0.2, 0.1, -0.34) > beta0 <- c(0.25,0.2, 0.1, -0.34) dfram <- data.frame(outmat) names(dfram) <- c("x1","x2","x3","y) > glmtmp <- glm(cbind(y,1-y) ~ . , family=binomial, data=dfram) > glmtmp Call: glm(formula = cbind(y, 1 - y) ~ ., family = binomial, data = dfram) Coefficients: (Intercept) x1 x2 x3 0.3894684 0.427663 0.1409899 -0.6862191 Degrees of Freedom: 300 Total; 296 Residual Residual Deviance: 398.6474 > lLikC <- function(beta) { pv <- plogis(beta[1] + data.matrix(dfram[,1:3]) %*% beta[2:4]) sum(dfram[,4]*log(pv)+ (1-dfram[,4])*log(1-pv)) } lLikC(glmtmp$coef) [1] -199.3237 > tmpmax <- nlmin(function(b) -lLikC(b), rep(0,4), print.lev=1) it nf f reldf preldf reldx 0 1 2.079e2 1 3 2.048e2 1.490e-2 3.899e-2 1.000e0 2 4 2.029e2 9.268e-3 1.296e-2 4.633e-1 3 5 2.013e2 8.278e-3 1.174e-2 3.900e-1 4 6 2.004e2 4.477e-3 8.827e-3 2.290e-1 5 7 2.002e2 1.069e-3 8.100e-3 1.990e-1 6 8 1.998e2 1.705e-3 7.395e-3 1.664e-1 7 10 1.994e2 2.149e-3 3.573e-3 4.627e-2 8 11 1.993e2 1.920e-4 5.725e-4 4.521e-2 9 12 1.993e2 4.783e-5 1.330e-4 2.981e-2 10 15 1.993e2 2.854e-5 4.605e-5 4.183e-3 11 16 1.993e2 1.361e-5 1.620e-5 4.337e-3 12 18 1.993e2 1.157e-5 1.251e-5 9.334e-3 13 20 1.993e2 1.932e-7 3.051e-7 1.145e-3 14 21 1.993e2 7.240e-9 7.974e-9 1.147e-4 15 22 1.993e2 6.259e-11 6.260e-11 1.033e-5 ***** relative function convergence ***** function 1.993237e2 reldx 1.0333e-5 function evaluations: 22 gradient evaluations: 77 preldf 6.2598e-11 npreldf 6.2598e-11 > tmpmax $x: [1] 0.3894684 0.4276630 0.1409899 -0.6862191 $converged: [1] T $conv.type: [1] "relative function convergence" > pv <- plogis(tmpmax$x[1] + data.matrix(dfram[,1:3]) %*% tmpmax$x[2:4]) > names(glmtmp) [1] "coefficients" "residuals" "fitted.values" [4] "effects" "R" "rank" [7] "assign" "df.residual" "weights" [10] "prior.weights" "family" "linear.predictors" [13] "deviance" "null.deviance" "call" [16] "iter" "y" "contrasts" [19] "terms" "formula" "control" ============================================================== ### Goodness of fit ? What next ? ### RESUME HERE 3/8 > apply(dfram[,1:3],2,summary) x1 x2 x3 Min. 0.0000 -6.44802500 0.0006170 1st Qu. 0.0000 -1.06085650 0.1530300 Median 0.0000 0.03665400 0.3683740 Mean 0.3033 0.07532922 0.5098403 3rd Qu. 1.0000 1.43638525 0.7484550 Max. 1.0000 5.28844700 2.5357340 > summary(glmtmp)$coef Value Std. Error t value (Intercept) 0.3894684 0.1919609 2.028895 x1 0.4276630 0.2623609 1.630056 x2 0.1409899 0.0634357 2.222564 x3 -0.6862191 0.2534120 -2.707918 > sqrt(diag(summary(glmtmp)$cov.un)) [1] 0.1919609 0.2623609 0.0634357 0.2534120 > round(summary(glmtmp)$cov.un,6) (Intercept) x1 x2 x3 (Intercept) 0.036849 -0.020813 0.000191 -0.032944 x1 -0.020813 0.068833 0.000965 0.001569 x2 0.000191 0.000965 0.004024 -0.001176 x3 -0.032944 0.001569 -0.001176 0.064218 ### This is the inverted "empirical information" matrix ### We verify this using the formula Iobs = ### sum t*(dfram[i,]) dfram[i,] dlogis(glmtmp$coef %*% dfram[i,]) > dmat <- cbind(rep(1,300),data.matrix(dfram[,1:3])) round(solve(t(dmat) %*% (dlogis(c(dmat %*% glmtmp$coef))*dmat)),6) x1 x2 x3 0.036850 -0.020813 0.000191 -0.032947 x1 -0.020813 0.068837 0.000965 0.001568 x2 0.000191 0.000965 0.004024 -0.001177 x3 -0.032947 0.001568 -0.001177 0.064223 ### Now for some "Goodness of Fit" Checks: (O) Plot showing comparison between 2 different "link" functions > xt <- seq(-4,4,.1) plot(xt, plogis(xt), type="n", xlab="GLM score", ylab="Probability Distribution", main= "Plots of Actual vs `Working' D.F. Model") lines(xt, plogis(xt), lty=3) lines(xt, pnorm(gfcn(xt))) printgraph(file="ModlsDsetC.ps") (O') Also note the comparison between true and estimated coefficients : > rbind(beta0, fitted=glmtmp$coef, s.dev=summary(glmtmp)$coef[,2]) (Intercept) x1 x2 x3 beta0 0.2500000 0.2000000 0.1000000 -0.3400000 fitted 0.3894684 0.4276630 0.1409899 -0.6862191 s.dev 0.1919609 0.2623609 0.0634357 0.2534120 NOTE: since we will find throughout that we pass all goodness- of-fit checks, it seems that the data are letting us conclude essentially that they behave "as if" they were simulated, so we should want the logistic-regression estimated coefficients estimated to be rather close to the true beta0. BUT: as a student pointed out, this is not strictly logical unless we show for this pair of "true"and "working" models that the theta_star vector (the large-sample limiting value of the logistic-model MLE under the pnorm(gfcn) model is in fact close to beta0. We now do that at the end of this Log-file, under paragraph (IV). (I) Comparison of 2 ways to calculate Iobs ### The first is the one inverted above: > Iobs1 <- t(dmat) %*% (dlogis(c(dmat %*% glmtmp$coef))*dmat) > round(Iobs1,4) x1 x2 x3 70.7929 20.5567 2.1942 35.8554 x1 20.5567 20.5567 -2.9857 9.9891 x2 2.1942 -2.9857 250.7962 5.7936 x3 35.8554 9.9891 5.7936 33.8269 > Iobs2 <- t(dmat) %*% ((dfram[,4] - plogis(c(dmat %*% glmtmp$coef)))^2*dmat) > round(Iobs2,4) x1 x2 x3 70.8330 20.5505 5.1217 34.4568 x1 20.5505 20.5505 -3.1552 9.6791 x2 5.1217 -3.1552 254.7686 7.8950 x3 34.4568 9.6791 7.8950 30.8615 #### Remarkably close !!! (II) Likelihoods. > sum(dfram[,4]*log(plogis(c(dmat %*% glmtmp$coef))) + (1-dfram[,4])*log(1-plogis(c(dmat %*% glmtmp$coef)))) [1] -199.3237 ### This was the maximized objective ### function above. > lLikC2 <- function(beta) { pv <- pnorm(gfcn(dmat %*% beta)) sum(dfram[,4]*log(pv)+ (1-dfram[,4])*log(1-pv)) } ### This is the logLik for true model [1] -200.7786 ### This is the logLik for true model and > lLikC2(glmtmp$coef) [1] -200.7786 ### using fitted coefficients > lLikC2(beta0) [1] -199.9061 ### logLik for true model, true coeff's > tmpmax2 <- nlminb(rep(0,4), function(b) -lLikC2(b))> tmpmax2[1:3] $parameters: [1] 0.2860430 0.2967455 0.1022324 -0.5276765 $objective: [1] 198.8974 ### So best logLik = -198.90 $message: [1] "RELATIVE FUNCTION CONVERGENCE" (III) Goodness of fit statistics: (A) > sum( (dfram[,4] - plogis(c(dmat %*% glmtmp$coef)))^2 )/ sum( dlogis(c(dmat %*% glmtmp$coef)) ) [1] 1.000568 ### Ratio of variances SHOULD be 1 ! > sum( (dfram[,4] - plogis(c(dmat %*% glmtmp$coef)))^2 / dlogis(c(dmat %*% glmtmp$coef)) ) [1] 299.6247 ### Fine for chi-squared 300 df! > mtabl <- table(dfram[,1], dfram[,2] > 0, dfram[,3] > .4, dfram[,4]) mtabl ### These are the observed counts , , FALSE, 0 FALSE TRUE 0 26 23 1 12 9 , , TRUE, 0 FALSE TRUE 0 23 30 1 11 3 , , FALSE, 1 FALSE TRUE 0 30 29 1 18 15 , , TRUE, 1 FALSE TRUE 0 21 27 1 8 15 > etabl <- aggregate.data.frame(plogis(c(dmat %*% glmtmp$coef)) , by= list(dfram[,1], dfram[,2] > 0, dfram[,3] > .4), sum) > etabl Group.1 Group.2 Group.3 x 1 0 FALSE FALSE 28.604299 2 1 FALSE FALSE 18.665996 3 0 TRUE FALSE 32.361965 4 1 TRUE FALSE 17.091907 5 0 FALSE TRUE 17.663440 6 1 FALSE TRUE 8.996245 7 0 TRUE TRUE 28.370296 8 1 TRUE TRUE 11.245852 > sum((etabl[,4]-mtabl[,,,2])^2/etabl[,4]) [1] 2.75716 ### chi-square gof for 8-4 df ### Should actually have calculated: > sum((etabl[,4]-mtabl[,,,2])^2/(etabl[,4]*(1-etabl[,4]/ (mtabl[,,,1]+mtabl[,,,2])))) [1] 6.750597 #### chi-square on 8-4 df (IV) Calculation of theta_star: recall from class (3/10/04) that theta_star is characterized as the maximizer of the integral under true density of the log of the working density: we define the Kullback Leibler functional of the two densities as a function of beta, and maximize it. NB. At this point dmat was discarded, so we simulate a new matrix of this type and re-calculate, which is OK since we are only estimaating analytical integrals !! > dmat0 <- cbind(rep(1,3000), rbinom(3000,1,0.3), rnorm(3000,0,2), rexp(3000)/2) > pvtrue <- plogis(dmat0 %*% beta0) KLfunc <- function(beta) { pvwork <- pnorm(gfcn(dmat0 %*% beta)) -mean(pvtrue*log(pvwork) + (1-pvtrue)*log(1-pvwork)) } nlmin(KLfunc,beta0) $x: [1] 0.19829751 0.14870286 0.07590155 -0.26249871 $converged: [1] T $conv.type: [1] "relative function convergence" > rbind(.Last.value$x, beta0, fitted=glmtmp$coef, s.dev=summary(glmtmp)$coef[,2]) (Intercept) x1 x2 x3 0.1982975 0.1487029 0.0759016 -0.2624987 ## theta_star beta0 0.2500000 0.2000000 0.1000000 -0.3400000 fitted 0.3894684 0.4276630 0.1409899 -0.6862191 s.dev 0.1919609 0.2623609 0.0634357 0.2534120