Homework Solutions Logs. ======================== (1) > V1 <- rep(c(0,1,1),40) V2 <- sin(2*pi*sqrt(1:120)) ### A better version (used below) would be: ### V2 <- round(sin(2*pi*sqrt(1:120)),1e-12) V3 <- ifelse(V2>=0, "H", "L") newmat <- cbind(Group = V1, Y = V2, Type = V3) newfram <- cbind.data.frame(Group = V1, Y = V2, Type = V3, stringsAsFactors=F) ### Note that the "stringsAsFactors=F" optional argument is ### needed to avoid automatic conversion of character data ### to factor-type. > unlist(lapply(newfram,class)) Group Y Type "integer" "numeric" "character" > newfram[1:5,] Group Y Type 1 0 0 H 2 1 1 H 3 1 -1 L 4 0 0 H 5 1 1 H > newmat[1:5,] Group Y Type [1,] "0" "0" "H" [2,] "1" "1" "H" [3,] "1" "-1" "L" [4,] "0" "0" "H" [5,] "1" "1" "H" (2) > exmpfram <- read.table(file="exampfram.txt", stringsAsFactors=F, skip=1) > dim(exmpfram) [1] 216 13 > exmpfram <- exmpfram[,3:9] > names(exmpfram) <- c("IDNUM","DTH","EVTTIME","TRTGP","LOGBILI", "AGEVAR","CIRRH") > exmpfram$CIRRH <- exmpfram$CIRRH == "T" > class(exmpfram$CIRRH) [1] "logical" > exmpfram2 <- scan(file="exampfram.txt", skip=1) ### This is a giant vector, of type "character" and length 216*13=2808 ### We are interested in pulling out the elements 3:8 in each row, ### which now correspond to elements with indices 3:8 plus ### multiples 0:215 of 13. > inds <- rep(3:9,216) + rep(13*(0:215),rep(7,216)) inds2 <- c(outer(3:9, 13*(0:215), "+")) > sum(abs(inds-inds2)) [1] 0 > exmpfram2 <- data.frame(matrix(exmpfram2[inds], ncol=7, byrow=T, dimnames=list(NULL, names(exmpfram))), stringsAsFactors=F) > exmpfram2[,7] <- as.logical(exmpfram2[,7]) > for(i in c(1:3,5:6)) exmpfram2[,i] <- as.numeric(exmpfram2[,i]) > unlist(lapply(exmpfram2,class)) IDNUM DTH EVTTIME TRTGP LOGBILI AGEVAR CIRRH "numeric" "numeric" "numeric" "character" "numeric" "numeric" "logical" > c(sum(exmpfram == exmpfram2), sum(unlist(lapply(exmpfram2,class))== unlist(lapply(exmpfram,class)))) [1] 1512 7 ### solves part (c), since all 216*7 entries match (including type) > t(apply(exmpfram[,c(1:3,5:6)],2,range)) ### solution to part (d) [,1] [,2] IDNUM 1.00000 248.00000 DTH 0.00000 1.00000 EVTTIME 0.00000 139.44000 LOGBILI 0.69897 2.72263 AGEVAR 1.64900 330.30000 ### same matrix is produced by: > cbind(apply(exmpfram[,c(1:3,5:6)],2,min), apply(exmpfram[,c(1:3,5:6)],2,max)) (3) (a) > matcols <- function(xvec, outcols) { outmat <- cbind(rep(1,length(xvec)), xvec) if(outcols > 1) for(i in 2:outcols) outmat <- cbind(outmat, xvec^i) outmat } > matcols(1:3,3) xvec [1,] 1 1 1 1 [2,] 1 2 4 8 [3,] 1 3 9 27 > matcols(2:5,1) xvec [1,] 1 2 [2,] 1 3 [3,] 1 4 [4,] 1 5 (b) > convtype <- function(infram, charac=NULL, numer=NULL, bool=NULL){ tmpfram <- infram if(!is.null(charac) && charac == "factor") { charcols <- unlist(lapply(tmpfram, is.character)) if(sum(charcols)) for(i in (1:ncol(tmpfram))[charcols]) tmpfram[,i] <- factor(tmpfram[,i]) } if(!is.null(numer) && numer == "charac") { numcols <- unlist(lapply(tmpfram, is.numeric)) if(sum(numcols)) for(i in (1:ncol(tmpfram))[numcols]) tmpfram[,i] <- as.character(tmpfram[,i]) } if(!is.null(bool) && (bool=="charac" | bool=="numer")) { boolcols <- unlist(lapply(tmpfram, is.logical)) if(bool=="charac") for(i in (1:ncol(tmpfram))[boolcols]) tmpfram[,i] <- as.character(tmpfram[,i]) else for(i in (1:ncol(tmpfram))[boolcols]) tmpfram[,i] <- as.numeric(tmpfram[,i]) } tmpfram } > auxfram <- cbind.data.frame(factor(as.character(1:10)), rep(c(3,5),5), rep(c(3,5),5) < 4, letters[12:21], stringsAsFactors=F) > auxfram X1 X2 X3 X4 1 1 3 T l 2 2 5 F m 3 3 3 T n 4 
4 5 F o 5 5 3 T p 6 6 5 F q 7 7 3 T r 8 8 5 F s 9 9 3 T t 10 10 5 F u > unlist(lapply(auxfram,class)) X1 X2 X3 X4 "factor" "integer" "logical" "character" > convtype(auxfram, charac="factor") X1 X2 X3 X4 1 1 3 T l 2 2 5 F m 3 3 3 T n 4 4 5 F o 5 5 3 T p 6 6 5 F q 7 7 3 T r 8 8 5 F s 9 9 3 T t 10 10 5 F u > unlist(lapply(.Last.value,class)) X1 X2 X3 X4 "factor" "integer" "logical" "factor" > convtype(auxfram, numer="charac", bool="numer") X1 X2 X3 X4 1 1 3 1 l 2 2 5 0 m 3 3 3 1 n 4 4 5 0 o 5 5 3 1 p 6 6 5 0 q 7 7 3 1 r 8 8 5 0 s 9 9 3 1 t 10 10 5 0 u > unlist(lapply(.Last.value,class)) X1 X2 X3 X4 "factor" "character" "numeric" "character" (4) (a) > SimLin <- function(errsd, coef, xvec=NULL, simx = NULL) { ### NB: required arguments are: ### errsd = error standard deviation ### coef = vector with 2 components (intercept,slope) ### Either vector xvec or sim=(mean,sd,length) ### parameter vector must be given non-null if(is.null(xvec)) xvec <- rnorm(simx[3],simx[1],simx[2]) list( xvec = xvec, coef = coef, errsd = errsd, simx = simx, yvec =coef[1]+coef[2]*xvec+rnorm(length(xvec))*errsd) } ### eg > SimLin(0.5, c(-1,1), 1:5) -> slst1 > round(slst1$yvec,3) [1] -0.010 0.494 2.458 2.309 3.765 (b) > slst2 <- SimLin(0.5, c(-1,1), xvec=runif(1000)) slst3 <- SimLin(1, c(2,.2), simx= c(0,1,1000)) ## For first case, show the numbers of xvec obs in each interval ## (k/10, (k+1)/10], and theoretical and observed mean, variance, ## covariance among xvec and yvec values > table(floor(slst2$xvec*10)) 0 1 2 3 4 5 6 7 8 9 110 93 104 100 104 99 113 101 105 71 ## Fairly even distribution: BUT probability for a count <= 71 in a ## specific interval is: > pbinom(71,1000,.1) [1] 0.0008603278 ### This is too small !! Bad random numbers! ## The next 10 batches of runif(1000) numbers gave smallest interval ## counts resp. equal to 85, 79, 93, 88, 93, 86, 79, 87, 85, 90 > round(c(mean(slst2$xvec), mean(slst2$yvec), var(cbind(slst2$xvec, slst2$yvec))), 4) [1] 0.4871 -0.5194 0.0796 0.0724 0.0724 0.3273 ### cf. .5, -.5 .0833 .0833 .0833 .3333 Theor. values ### Now exhibit residuals epsilon for slst3 , through fractions ### falling in ranges (-10,-1.5), (-1.5,-.5), (-.5,.5), (.5,1.5), (1.5,10) > eps3 <- slst3$yvec - 2 - .2*slst3$xvec hist(eps3, breaks=c(-10,-1.5,-.5,.5,1.5,10), plot=F) .. $counts: [1] 72 234 374 241 79 ### compare with theoretical expected values > round(diff(pnorm(c(-10,-1.5,-.5,.5,1.5,10)))*1000,2) [1] 66.81 241.73 382.92 241.73 66.81 ### OK !!! ### And again: means and (co)variances followed by theoretical values > round(c(mean(slst3$xvec), mean(slst3$yvec), var(cbind(slst3$xvec, slst3$yvec))), 4) [1] 0.0137 2.0281 1.0110 0.1638 0.1638 1.0816 ### 0 2 1 .2 .2 1.04 Theor. values (5) Recall from class that in order to achieve precision delta with probability at least 1-alpha, we apply the formula B > = (qnorm(1-alpha/2)^2 * sigst^2 / delta^2 where sigst is any upper bound or estimate for the standard deviation of the simulated iid variables. (a) Since alpha=delta=1e-2, and since the variables X are uniformly bounded between 0 and sqrt(3), so that sigst <= 3/4, our formula gives B >= 2.576^2*.75/1.e-4 = 49768.32. So we use B=5e4. > tmpvec <- runif(5e4) tmpvec <- sqrt(1+tmpvec+tmpvec^4) sigest <- sqrt(var(tmpvec)) c(mean(tmpvec) + c(-1,1)*qnorm(.995)*sigest/sqrt(5e4), sigest/0.75) [1] 1.2878769 1.2924819 0.2665071 ### So the estimated integral is 1.2902, with precision a bit better ### than required. 
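Added note (not in the original log): the sample-size bound just used is easy to wrap in a small helper, which makes the same calculation reusable in part (b). The name mcB and its arguments are illustrative only.

> mcB <- function(delta, alpha, sigsq) ceiling(qnorm(1 - alpha/2)^2 * sigsq/delta^2)
> mcB(1e-2, 1e-2, 0.75)    ### part (a): just under 5e4, consistent with using B = 5e4
> mcB(1e-2, .01/6, 0.25)   ### part (b) below: about 24712, the Bonferroni-adjusted bound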
The exact answer is given by: > tmpl <- integrate(function(x) sqrt(1+x+x^4),0,1) c(tmpl$integral, tmpl$abs.error) [1] 1.288417e+00 1.066592e-11 ### OK, our error was +.00176 (b) Note that the r.v.'s of interest are indicators of whether or not ## there are k distinct values in a given simulation. These are ## all binary (ie Bernoulli) so have sigsq <= 0.25, so for ## alpha = .01/6, get the formula B >= qnorm(1-.01/12)^2*.25/1.e-4 ## = 24711.53 . (We divided by 6 --- using `Bonferroni's ## inequality' --- to ensure that the union of the seven event each ## of which may have probability up to .01/6, will altogether have ## probability <= .01. So we can use B = 25000; I used B=25500. > tmpmat <- array(0, dim=c(25500,10)) for(i in 1:6) { tmpvec <- sample(1:10,25500, replace=T) tmpmat[cbind(1:25500,tmpvec)] <- 1 } dvec <- c(tmpmat %*% rep(1,10)) > round(table(dvec)/25500, 4) 2 3 4 5 6 0.0029 0.0629 0.3275 0.4593 0.1474 ### Compare with true analytically calculated values: 0.0028 0.0648 0.3276 0.4536 0.1512 ### NB. The probability that only 1 distinct value occurs = 1e-5. ### The largest absolute error turns out to have been: .0057 ### which indicates that our Bonferroni-inequality calculation ### was not terribly conservative. *** Reasoning for exact calculation: for example, P(5 distinct) = 1e-6*( 10*{6 choose 2}*9*8*7*6) =.4536. *** The others can be done similarly (with more distinct cases). (6) > cathed <- read.table(file="Cathed.dat") > motif() > plot(cathed[,4], cathed[,3], type="n", xlab="Length (feet)", ylab="Height (feet)", main= "English Cathedral Heights vs Lengths, by Style") points(cathed[cathed[,2]==0,4],cathed[cathed[,2]==0,3], pch=5) points(cathed[cathed[,2]==1,4],cathed[cathed[,2]==1,3], pch=18) tm0 <- lm(V3 ~ V4, data=cathed[cathed[,2]==0,])$coef abline(tm0[1],tm0[2], lty=3) tm1 <- lm(V3 ~ V4, data=cathed[cathed[,2]==1,])$coef abline(tm1[1],tm1[2], lty=1) legend(locator(), legend=c("Romanesque","Gothic"), marks=c(5,18), lty=c(3,1)) printgraph(file="Graphic2.ps") (7) > motif() > betvec <- rbeta(1600,2,3) hist(betvec, nclass=40, prob=T, xlab="Simulated variate scale", ylab="Scaled relative frequency", main= "Scaled Rel Freq Histogram: Beta Data & Overlaid Density") lines((1:200)/200, dbeta((1:200)/200,2,3), lty=2) (8) (a) Here the point is that, in using the "bivariate probability integral transform" method discussed in class, you can work either with F_X and F_{Y|X} or with F_Y and F_{X|Y}. If the objective is to do everything analytically, then F_Y and F_{X|Y} works better, so that will be the way I do it first (Method 1). The other way works too, but uses a little more in the way of Splus trickery because F_X, while simple, cannot be analytically inverted. That will be Method 2, and I use the two methods to check each other. ## First we exhibit the marginal and conditional df's. 
> Fx <- function(x) 1 - 1.5/x^2 + .5/x^3 Fy <- function(y) 1-1.5/y + .5/y^2 Finvy <- function(r) .75*(1 + sqrt(1-8*(1-r)/9))/(1-r) FYcondX <- function(x,y) ifelse(y <= x, (y-1)/(2*x-1), 1 - x^2/(y*(2*x-1))) FXcondY <- function(x,y) ifelse(x <= y, 1.5*(1-1/x)/(1.5-1/y), 1 - .5*(y/x)^3/(1.5*y-1)) FYinvcondX <- function(r,x) ifelse(r <= FYcondX(x,x), (2*x-1)*r+1, x^2/((2*x-1)*(1-r))) FXinvcondY <- function(r,y) ifelse(r <= FXcondY(y,y), 1/(1-r*(1-2/(3*y))), y*(.5/((1-r)*(1.5*y-1)))^(1/3)) ## NB we can test these inverses simply, eg by generating ## arbitrary X values on (1,infty) and R on (0,1) and ## then making sure that FYcondX(FYinvcondX(R,X),X)=R > Xtmp <- exp(rexp(100)); Rtmp <- runif(100) > sum(abs(FYcondX(Xtmp, FYinvcondX(Rtmp,Xtmp))-Rtmp)) [1] 1.259409e-15 ### this is =0, OK > sum(abs(FXcondY(FXinvcondY(Rtmp,Xtmp),Xtmp)-Rtmp)) [1] 5.972653e-15 ### OK again > sum(abs(Fy(Finvy(Rtmp))-Rtmp)) [1] 2.664535e-15 ### OK again METHOD 1. > Ytmp <- Finvy(runif(1000)) Xtmp <- FXinvcondY(runif(1000),Ytmp) ### We display the histogram of log(Xtmp) versus ### the theoretical density exp(z)*fx(exp(z)) > hist(log(Xtmp), nclass=36, prob=T, main= "Scaled Histogram of log(X)") > lines((1:400)/100 -> lxp, 3*exp(-2*lxp)-1.5*exp(-3*lxp), lty=3) ### Virtually perfect agreement > hist(log(Ytmp), nclass=36, prob=T, main= "Scaled Histogram of log(Y)") lines((1:400)/100 -> lxp, 1.5*exp(-lxp)-exp(-2*lxp), lty=3) ### Again very close agreement METHOD 2. > xpt <- exp((0:400)/80) > Xtmp2 <- predict.smooth.spline(smooth.spline(Fx(xpt), xpt, spar=1.e-6), runif(1000))$y Ytmp2 <- FYinvcondX(runif(1000), Xtmp2) (b) Here there are simple formulas: X = R*cos(Th), Y = R*sin(Th) where R ~ F_R(r) = (2/pi)*arc tan(r^2) and Th ~ Unif(0,2*pi) are independent. > Thvec <- 2*pi*runif(1000) Rvec <- sqrt(tan(pi*runif(1000)/2)) Xvec <- Rvec*cos(Thvec) Yvec <- Rvec*sin(Thvec) (9) (a) The region of interest is the set in the place with x-coordinates ranging from -5 to 5, and with y ranging from 0 to min((10 +2x)/3, 5 - x). It is easy to see that the density of X is not uniform but rather is proportional to the height (max y-coordinate) of the triangle for each value x in [-5,5]. The constant of proportionality is the reciprocal of the area of the triangle, ie fX(x) = .05*min((10+2x)/3, 5 - x). The corresponding df is FX(x) = (.05/3)*(x+5)^2 for x<1, and = 1 - .025*(5-x)^2 for 1<= x < 5. Thus: > XYfcn <- function(N) { Xval <- numeric(N) U1val <- runif(N) ind1 <- U1val <= .6 Xval[ind1] <- sqrt(60*U1val[ind1])-5 Xval[!ind1] <- 5-sqrt(40*(1-U1val[!ind1])) Yval <- pmin(10/3 + (2/3)*Xval, 5 - Xval)*runif(N) list(Xval=Xval, Yval=Yval) } An alternate method here is to generate distributions in the rectangle [-5,5] x [0,4], and reflect observations falling outside the triangle across the lines y=10/3 + (2/3)*x if x<1 and across y=5-x if x>1, to obtain points in the tirangle in every case: > XYfcn2 <- function(N) { Xmat <- cbind(-5 + 10*runif(N), 4*runif(N)) inds1 <- Xmat[,1] < 1 & Xmat[,2] > 10/3 + (2/3)*Xmat[,1] Xmat[inds1,] <- outer(rep(2,sum(inds1)), c(-2,2),"*") - Xmat[inds1,] inds2 <- Xmat[,1] >= 1 & Xmat[,2] > 5-Xmat[,1] Xmat[inds2,] <- outer(rep(2,sum(inds2)), c(3,2),"*") - Xmat[inds2,] list(Xval=Xmat[,1], Yval = Xmat[,2]) } tmp2 <- XYfcn2(1000) When plotted, both of the two 1000-vectors look uniform over the triangle. For a more precise test, we do a chi-square with cells defined by x-values in intervals [-5,-3], [-3,-1], [-1,1], [1,3], [3,5] and with y-values in intervals [0,2] and [2,4]. 
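The object tmp tabulated in the chi-square check just below is not created anywhere in this excerpt; presumably it was produced from the first simulator in the same way tmp2 was, i.e. something like:

> tmp <- XYfcn(1000)     ### assumed; only tmp2 <- XYfcn2(1000) appears above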
> t1 <- table((tmp$Xval+1) %/% 2 , (tmp$Yval %/% 2)) > t1 0 1 -2 58 0 -1 180 18 0 196 129 1 218 110 2 91 0 > t2 <- table((tmp2$Xval+1) %/% 2 , (tmp2$Yval %/% 2)) t2 0 1 -2 63 0 -1 181 14 0 184 146 1 195 106 2 111 0 ** The theoretical areas of the intersections of the desired triangle with the rectangular cells, expressed as a fraction of the area 20 of the whole triangle, are as follows: > ea <- round(.05*matrix(c(2*.5*(4/3),0,2*(.5*(5/3)+.5*(6/3)), 1/3, 4, 8/3, 4, 2, 2, 0), byrow=T, ncol=2)*1000,2) > ea [,1] [,2] [1,] 66.67 0.00 [2,] 183.33 16.67 [3,] 200.00 133.33 [4,] 200.00 100.00 [5,] 100.00 0.00 ### Now for the 2 chi-square statistics on 5 df : > c(sum(((c(t1)-c(ea))^2/c(ea))[c(ea)>0]), sum(((c(t2)-c(ea))^2/c(ea))[c(ea)>0])) [1] 4.944696 4.838281 #### OK !!! (b) The rejection method is easier to program. Note that since we should reject about half of the random numbers, to get 1000 pairs it should suffice to generate 2200. (To check this, note that > pbinom(2000,2200,.5) ### = 1 More generally, we propose to generate 400 + 2.2*N pairs. > XYfcn3 <- function(N) { naux <- ceiling(400+2.2*N) Xmat <- cbind(-5+10*runif(naux), 4*runif(naux)) inds <- (1:naux)[(Xmat[,1] < 1 & Xmat[,2] < 10/3+ (2/3)*Xmat[,1]) | (Xmat[,1] >= 1 & Xmat[,2] < 5-Xmat[,1])] list(Xval=Xmat[inds[1:N],1], Yval=Xmat[inds[1:N],2]) } tmp3 <- XYfcn3(1000) > sum(((c(table((tmp3$Xval+1) %/% 2 , (tmp3$Yval %/% 2))) - c(ea))^2/c(ea))[c(ea)>0]) ### now chisq = 5.727582 (10) Here we do not have the luxury of analytically known Finv or even F . First evaluate numerical integral via Simpson's rule, then invert using the smoothing-spline trick. ### First tabulate cdf at points k/10, k=0:100 > xpt <- (0:200)/10 fpt <- 0.3851565*log(1+xpt)*xpt^2*exp(-xpt) fpt[1] <- 0 Fpt <- pmin((6/60)*cumsum(fpt)[seq(1,201,2)],1) xpt <- xpt[seq(1,201,2)] tspl <- smooth.spline(Fpt,xpt, spar=1.e-6) ### Now with this pre-processing done: to generate 1000 desired ### points: > Xvec <- predict.smooth.spline(tspl, runif(1000))$y ### density looks bimodal!! But there is a unique root for its ### derivative, near 2. > summary(Xvec) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.2057 2.0470 3.0320 3.3280 4.2550 11.9500 > hist(Xvec,nclass=36, prob=T) > lines((0:200)/10, fpt) ### very nice fit !! ## To solve part (a) as requested, in a function, can take > ftmp1 <- function(uvec) predict.smooth.spline(tspl, uvec)$y > Xvec2 <- predict.smooth.spline(tspl, runif(1000))$y > summary(Xvec2) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.2039 2.1330 3.1960 3.5330 4.5120 12.9600 ### looks very much the same ### For both cases, let's check that we get the right relative ### frequency on (3.5,7): > integrate(function(x) 0.3851565*log(1+x)*x^2*exp(-x), 1.e-6,7)$integral ### = 0.9493278 > integrate(function(x) 0.3851565*log(1+x)*x^2*exp(-x), 1.e-6,3.5)$ value integral ### = 0.5621234 > 0.9493278-0.5621234 [1] 0.3872044 > c(sum(Xvec < 7 & Xvec > 3.5), sum(Xvec2 < 7 & Xvec2 > 3.5))/1000 [1] [1] 0.355 0.399 ### close enough to the theoretical value 0.387 ### For accept/reject method, we must find a density g(x) from which we know how to simulate, such that f(x) <= k*g(x) for a not-too-large constant k. The simplest way is to use the inequality log(1+x) < x to take g(x) proportional to x^3*exp(-x), i.e., Gamma(4,1). ### So g(x) = (1/6)*x^3*exp(-x), and f(x) <= 6c g(x), and k=6*c = 2.310939. 
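Before using this bound in the accept/reject sampler below, it can be checked numerically; a small added verification (the grid xx is illustrative, not from the original log):

### f(x)/g(x) = 6*0.3851565*log(1+x)/x, which decreases in x and approaches its
### supremum 6*0.3851565 = 2.310939 as x -> 0+.
> xx <- seq(0.001, 20, by=0.001)
> max(6*0.3851565*log(1+xx)/xx)    ### just below 2.310939, attained at the left end of the grid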
> XsimAcc <- function(N, fac=4) { ### Simulate fac*N + 300 variates naux <- ceiling(fac*N+300) Xaux <- rgamma(naux,4) Uaux <- runif(naux)*2.310939 Xaux[(1:naux)[Uaux < 2.310939*log(1+Xaux)/Xaux]][1:N] } > Xvec3 <- XsimAcc(1000, 5) summary(Xvec3) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.3199 2.2220 3.1540 3.4660 4.4400 13.3500 > integrate(function(x) 0.3851565*log(1+x)*x^2*exp(-x), 1.e-6,2)$integral [1] 0.2051833 > integrate(function(x) 0.2600396*(1+log(x))*x^2*exp(-x), 1.e-6,4)$integral [1] 0.661278 > integrate(function(x) 0.2600396*(1+log(x))*x^2*exp(-x), 1.e-6,8)$integral [1] 0.9754157 ### So let's do chisquare (with 3df) on all three generated ## X-vectors. > evec <- diff(c(0,0.2051833 , 0.661278, 0.9754157, 1))*1000 > evec [1] 205.1833 456.0947 314.1377 24.5843 > sum((hist(Xvec, breaks=c(0,2,4,8,20), plot=F)$counts-evec)^2/evec) [1] 9.732789 ### NB p-value = 0.0210 > sum((hist(Xvec2, breaks=c(0,2,4,8,20), plot=F)$counts-evec)^2/evec) [1] 5.10709 > sum((hist(Xvec3, breaks=c(0,2,4,8,20), plot=F)$counts-evec)^2/evec) [1] 4.73794 > hist(Xvec3, breaks=c(0,2,4,8,20), plot=F)$counts [1] 191 487 303 19 ### For the timing of the methods, let's generate 1e6 variates: > unix.time(xtmp <- predict.smooth.spline(tspl, runif(1e6))$y) [1] 3.07 0.39 5.75 0.00 0.00 > unix.time(xtmp2 <- XsimAcc(1e6, 2.5)) ### This should be a large enough factor for such a large ### number of variates: ### by LLN, any factor > k =2.310939 should work! [1] 10.35 0.97 27.61 0.00 0.00 ### The accept-reject scheme is much less efficient, both because of the many extra random numbers being generated and because of the mathematical operations like log's.(The predict.spline.smooth evaluation is just linear algebra. NOTE that the elapsed-time value (the third of the times produced) is not a good measure of efficiency when you are using a shared-resource network. But the CPU time IS. (11) gmix <- function(N) { xout <- numeric(N) eps <- rbinom(N,1,.3) naux <- sum(eps) xout[eps==1] <- rnorm(naux) xout[eps==0] <- rlogis(N-naux)*1.5 xout } > xtmp <- gmix(1000) > summary(xtmp) Min. 1st Qu. Median Mean 3rd Qu. Max. -13.25289 -1.15870 0.00146 0.01452 1.23705 14.89885 > diff(c(0, 0.3*pnorm(c(-5,-1,.5,4))+.7*plogis(c(-5,-1,.5,4)/1.5),1)) [1] 0.02411172 0.26095540 0.33017076 0.33927420 0.04548792 expec <- 1000*.Last.value obs <- hist(xtmp, breaks=c(-15,-5,-1,.5,4,20), prob=F, plot=F)$count > sum((obs-expec)^2/expec) [1] 2.217372 ### Very moderate for chi-square 5df !! (b) First generate triple mm of counts of observations falling intorespective intervals (-Inf, -1], (-1,1], (1,Inf) (with respective probabilities .2/e = .0736, 1-.5/e = .8161, .3/e = .1104. Then simulate within these intervals from corresponding densities using exponentials for extreme intervals and rejection method using normals for middle interval. NB. 
We can actually use the rejection method for the middle interval to choose those observations falling in the tails, as follows, using the facts > c(qnorm(.2/exp(1)), qnorm(1-.3/exp(1))) [1] -1.449666 1.224595 > ThrSim <- function(N, fac=2) { rnvec <- rnorm(N) mm1 <- sum(rnvec < -1.449666) mm3 <- sum(rnvec > 1.224595) ind <- (1:N)[rnvec >= -1.449666 & rnvec <= 1.224595] rnaux <- c(rnvec[ind], rnorm(200+sum(ind)*(fac-1))) outvec <- numeric(N) outvec[rnvec < -1.449666] <- -1-rexp(mm1) outvec[rnvec > 1.224595] <- 1+rexp(mm3) outvec[ind] <- rnaux[abs(rnaux)<1][1:(N-mm1-mm3)] list(outvec = outvec, mcounts =c(mm1,N-mm1-mm3,mm3), normVal=ind) } > dout <- ThrSim(1000) > dout$mcounts/1000 [1] 0.078 0.818 0.104 ### These numbers match well > chisq.gof(dout$outvec[dout$outvec> 1]-1, dist="exponential", rate=1/mean(dout$outvec[dout$outvec> 1]-1), n.param.est=1) Chi-square Goodness of Fit Test data: dout$outvec[dout$outvec > 1] - 1 Chi-square = 6.25, df = 11, p-value = 0.8562 ### OK !!! > chisq.gof(-dout$outvec[dout$outvec < -1]-1, dist="exponential", rate=1/mean(-dout$outvec[dout$outvec < -1]-1), n.param.est=1) Chi-square Goodness of Fit Test data: - dout$outvec[dout$outvec < -1] - 1 Chi-square = 11.2308, df = 10, p-value = 0.3398 ### OK. (12) There are a few different ways to do this. The ones I will cover are three: (a) multivariate probability integral transform method (very messy!), (b) same method applied to affinely transformed data (very simple!), (c) naive accept-reject (slightly wasteful but very simple). METHOD 0. This is a really ugly method, when applied to the untransformed problem. As an indication, find the marginal density f_Y(y) = c* I[0 integrate(function(y) (1-y)* pmin(y,1.5-y),0,1)$integral + integrate(function(y) 0.5*((1-y)^2 - pmax(0,y-0.5)^2), 0,.75)$integral [1] 0.3229167 This agrees to seven decimal places with the value 31/96 calculated below, under method 1. METHOD 1. We know how to do change-of-variable based on affine transformations. Here let u=x, v=x+y, w=x+z. Then the mapping (x,y,z) -> (u,v,w) is evidently linear and invertible, with Jacobian determinant 1: so the inverse preserves volumes, and the region of interest in (u,v,w) coordinates is (v,w) in (0,1)^2, max(0.5*(v+w)-0.75, 0) < u < min(v,w) However, we must still do the necessary integrations if we want to use the (trivariate) probability integral transform method, yielding f_{V,W}(v,w) = c * (min(v,w) - max(0, 0.5*(v+w) - 0.75)), for v, w in (0,1); and then f_V(v) = c*(v - 0.5*v^2 - 0.25*(max(0,v-0.5))^2); so that F_V(v) = c*( 0.5*v^2 - v^3/6 - (1/12)*(max(0,v-0.5))^3) ## Setting v=1, F_V(1)=1, gives the value c=96/31 mentioned above. ## NOTE that direct numerical inversion of this distribution function ## is not feasible in parallel fashion, so we resort to tabulation, ## spline-interpolation of the inverse, etc. 
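A quick added cross-check of the constant before continuing: integrating the unnormalized f_V over (0,1) numerically should give 31/96 = 0.3229167, the same volume found under Method 0 above, confirming c = 96/31.

> integrate(function(v) v - 0.5*v^2 - 0.25*pmax(0, v - 0.5)^2, 0, 1)
### returns 0.3229167 = 31/96  (component $integral in S-Plus, $value in R)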
F_{W|V}(w|v) = (c/F_V(v))*(0.5*min(v,w)^2+ v*max(w-v,0) - 0.25*I[v>0.5]*(v+w-1.5)^2) > fv <- function(v) (96/31)*(v - 0.5*v^2 - 0.25*(pmax(0,v-0.5))^2) FV <- function(v) (96/31)*( 0.5*v^2 - v^3/6 - (1/12)*(pmax(0,v-0.5))^3) FWcondV <- function(w,v) (0.5*pmin(v,w)^2 + v*pmax(w-v,0) - 0.25* pmax(0,v+w-1.5)^2)/(v - 0.5*v^2 - 0.25*(pmax(0,v-0.5))^2) #### OK, now we can use the tabulate/interpolate idea for V, and explicit inversion for FWcondV: > FWinvV <- function(r,v) { fout <- numeric(length(r)) ## assumes r, v have same length vsm <- (v < 0.5) wltv <- (r < FWcondV(v,v)) fout[vsm & wltv] <- { vaux <- v[vsm & wltv] sqrt(2*r[vsm & wltv]*vaux*(1 - 0.5*vaux)) } fout[vsm & !wltv] <- { vaux <- v[vsm & !wltv] r[vsm & !wltv]*(1 - 0.5*vaux) + vaux/2 } vwsumg <- (r >= FWcondV(pmin(1.5-v,1),v)) if(sum(!vsm)) { auxb <- (!vsm & !vwsumg) cc <- 96/31 if(sum(auxb)) { fout[auxb & wltv] <- { vaux <- v[auxb & wltv] sqrt(2*r[auxb & wltv]*fv(vaux)/cc) } fout[auxb & !wltv] <- { vaux <- v[auxb & !wltv] r[auxb & !wltv]*fv(vaux)/(cc*vaux) + vaux/2 } } auxb <- (!vsm & vwsumg) if(sum(auxb)) { fout[auxb & wltv] <- { vaux <- v[auxb & wltv] sqrt(4*r[auxb & wltv]*fv(vaux)/cc+2*(vaux-1.5)^2) + vaux-1.5 } fout[auxb & !wltv] <- { vaux <- v[auxb & !wltv] vaux+1.5 - sqrt(vaux*(6-2*vaux)-4*r[auxb & !wltv]*fv(vaux)/cc)} } } fout } > vt <- runif(1000) wt <- runif(1000) rt <- FWcondV(wt,vt) summary(FWinvV(rt,vt)-wt) Min. 1st Qu. Median Mean 3rd Qu. Max. -2.220e-16 0.000e+00 0.000e+00 -1.629e-18 0.000e+00 2.220e-16 ### Up to here, we have tested that FWinvV correctly inverts FWcondV > M1 <- function(N, spacng=.01) { spts <- (0:100)/100 Vspl <- smooth.spline(FV(spts),spts, spar=1.e-6) Vsim <- predict.smooth.spline(Vspl,runif(N))$y Wsim <- FWinvV(runif(N),Vsim) Usim <- runif(N, pmax((Vsim+Wsim)/2-.75,0), pmin(Vsim,Wsim)) cbind(x=Usim, y=Vsim-Usim, z=Wsim-Usim) } METHOD 2. Now we have only to generate a series of X,Y,Z values directly in the cube (0,1)^3 and reject those that fall outside the region. > M2 <- function(N, fac=1/0.321) { naux <- 200 + N*fac xyzmat <- matrix(runif(3*naux), ncol=3, dimnames=list(NULL, c("x","y","z"))) bool <- xyzmat[,1]+xyzmat[,2] < 1 & xyzmat[,1]+xyzmat[,3] < 1 & xyzmat[,2]+xyzmat[,3] < 1.5 list(xyzmat = xyzmat[bool,][1:N,], volum=mean(bool)) } > M2(1000)[[2]] [1] 0.3309201 > M2(1.e6)[[2]] [1] 0.3226401 ### took 10 minutes on Sparc 5 !? ### OK, now generate two different matrices 1000 x 3: > mat1 <- matrix(unlist(M1(1000)), ncol=3) mat2 <- M2(1000) ### recall this is a list by def'n > mat2$volum [1] 0.3257919 ### OK, still agrees with volumes above > mat2 <- mat2$xyzmat ## For chi-square use the cells based on x,y,z < .5 or >= .5 ## NB the cells with x,y>.5 or x,z > .5 are empty (4 in all) > t1 <- table( mat1[,1] >= .5, mat1[,2] >= .5, mat1[,3] >= .5) t2 <- table( mat2[,1] >= .5, mat2[,2] >= .5, mat2[,3] >= .5) > t1 , , = FALSE FALSE TRUE FALSE 401 186 TRUE 122 0 , , = TRUE FALSE TRUE FALSE 190 101 TRUE 0 0 > t2 , , = FALSE FALSE TRUE FALSE 386 177 TRUE 126 0 , , = TRUE FALSE TRUE FALSE 199 112 TRUE 0 0 > sum(t1+t2>0) [1] 5 ### so the chi-square will have 4 df > sum(((t1-t2)^2/(t1+t2))[t1+t2>0]) [1] 1.349854 #### Convincingly small !!! Method 3. Suppose we had wanted only to test the validity of M2 without working so hard to establish the correct marginal and onditional densities. It would have sufficed to create a rejection- method simulation which worked differently from (and so might be expected to have errors, if any, different from) M2. 
For example, let's use the same transformation as in Method 1; create (V,W) by a reject-accept method using the known density f_{V,W}; and then generate U conditionally uniform in [max(0.5*(v+w)-0.75, 0), min(v,w)]. > M3 <- function(N, fac=5) { naux <- 200 + N*fac vwrmat <- matrix(runif(2*naux), ncol=2) vwrmat <- vwrmat[runif(naux) < pmin(vwrmat[,1], vwrmat[,2]) -pmax(0,0.5*(vwrmat[,1]+vwrmat[,2])-0.75),][1:N,] uvec <- runif(N, pmax(0,0.5*(vwrmat[,1]+vwrmat[,2])-0.75), pmin(vwrmat[,1], vwrmat[,2])) xyzmat = cbind(x=uvec, y=vwrmat[,1]-uvec, z=vwrmat[,2]-uvec) } > mat3 <- M3(1000) > t3 <- table(mat3[,1]>0.5, mat3[,2]>0.5, mat3[,3]>0.5) > t3 , , = FALSE FALSE TRUE FALSE 363 215 TRUE 125 0 , , = TRUE FALSE TRUE FALSE 204 93 TRUE 0 0 > sum(((t3-t2)^2/(t3+t2))[t3+t2>0]) [1] 6.216943 ### still OK for 4 df chisq > 1-pchisq(6.217,4) ### pvalue = 0.184 (13) > Trapez <- function(gfcn, a, b, m) { hh <- (b-a)/m gvec <- gfcn(seq(a,b, by=hh)) cumsum(c(0, gvec[1:m]+gvec[2:(m+1)]))*(hh/2) } SimpRul <- function(gfcn, a, b, m) { hh <- (b-a)/(2*m) gvec <- gfcn(seq(a,b, by=hh)) cumsum(c(0,gvec[seq(1,2*m-1, by=2)]+4*gvec[(1:m)*2] + gvec[seq(3,2*m+1, by=2)]))*(hh/3) } (cumsum(c(0, gvec[1:(2*m-1)]+4*gvec[2:(2*m)] + gvec[3:(2*m+1)]))*(hh/3))[seq(1,2*m+1, by=2)] > summary(pgamma(seq(0,7,by=1/10), shape=2.5) - Trapez(function(x) dgamma(x, shape=2.5), 0, 7, 70)) Min. 1st Qu. Median Mean 3rd Qu. -0.0002429525 0.0000719017 0.0000917735 0.0000611028 0.0001241061 Max. 0.0001456788 > tmp <- SimpRul(function(x) dgamma(x, shape=2.5), 0, 7, 70) summary(pgamma(seq(0,7,by=1/10), shape=2.5) - tmp) Min. 1st Qu. Median Mean 3rd Qu. -6.350734e-06 -6.343742e-06 -6.340412e-06 -6.243459e-06 -6.340257e-06 Max. 0.000000e+00 (14) > NRaph <- function(x0, b, gfcn, gprimfcn, tol = 1e-06, maxit = 15) { xout <- x0 bmat <- matrix(b, ncol = if(is.null(ncol(b))) 1 else ncol(b)) g.old <- gfcn(x0, bmat) ind <- 1:length(x0) hlist <- NULL ctr <- 0 while(ctr < maxit & length(ind)) { xout[ind] <- xout[ind] - g.old/ gprimfcn(xout[ind], bmat[ind, ]) ctr <- ctr + 1 ind <- ind[abs(g.old) > tol] g.old <- gfcn(xout[ind], bmat[ind, ]) ### hlist is a list withindices used in successive iterations hlist <- c(hlist, list(ind)) } list(xout=xout, hlist = hlist) } > g <- function(x, b) 4 * x^3 - 3 * x^2 - b gprim <- function(x,b) 12*x^2 - 6*x > tmpb <- runif(20) tmpx <- NRaph(rep(1,20), tmpb)$xout ### cannot use initial x0 = 0 or 0.5, or else gprim=0 > round(tmpx,5) [1] 0.77220 0.95311 0.79953 0.96977 0.90097 0.91101 0.88998 0.82857 0.90248 [10] 0.87705 0.90143 0.94297 0.88818 0.84548 0.95940 0.80978 0.90905 0.96719 [19] 0.77900 0.79330 > for (i in 1:20) cat(round(c(tmpb[i], optimize(g,c(-10,10), b=tmpb[i])$min),5), "\n") 0.05295 0.77221 0.73802 0.95311 0.12665 0.79952 0.82675 0.96977 0.49021 0.90097 0.53452 0.91102 0.4435 0.88998 0.21575 0.82856 0.49676 0.90248 0.39091 0.87704 0.49218 0.90143 0.68635 0.94297 0.43601 0.88818 0.273 0.84547 0.77099 0.95941 0.15679 0.80977 0.52573 0.90906 0.81266 0.96719 0.07039 0.779 0.109 0.79332 (15) Simulate using b0=3.6, > nvec <- c(100,50, 120,80) pvec <- 0.1*log(1+3.6*(1:4)) mvec <- rbinom(4,nvec,pvec) > cbind(nvec=nvec, pvec=pvec, xv=1:4, mvec=mvec, Emvec=round(nvec*pvec,2)) nvec pvec xv mvec Emvec [1,] 100 0.1526056 1 19 15.26 [2,] 50 0.2104134 2 13 10.52 [3,] 120 0.2468100 3 30 29.62 [4,] 80 0.2734368 4 24 21.87 ### (exp(10*mvec[3]/nvec[3])-1)/3 [1] 3.727498 ### will use this as starting value ### Other possible choices would have been: > (exp(10*mvec/nvec)-1)/(1:4) [1] 5.685894 6.231869 3.727498 4.771384 ### NB all are 
greater than the true value 3.6, ### because it just happens that all observed ### numbers of successes are larger than their ### corresponding expected numbers. > lLikHW15 <- function(b) { pvb <- .1*log(1+b*(1:4)) ## vector sum(mvec*log(pvb)+(nvec-mvec)*log(1-pvb)) } ## Note that this function does not make sense for ## b greater than (exp(10)-1)/4 = 5506.366 > optimize(lLikHW15, c(0,5000), max=T)[c(1,2,5)]$maximum: [1] 4.820766 $objective: [1] -193.9393 $message: [1] "normal termination" > c(lLikHW15(4.8208), lLikHW15(3.6)) [1] -193.9393 -194.6267 ### The likelihood is greater at the MLE of 4.82 ### than at the true value, but not MUCH greater. > NRaph2 <- function(x0, gfcn, gprimfcn, tol = 1e-06, maxit = 15) { xout <- x0 g.old <- gfcn(x0) ctr <- 0 while(ctr < maxit & abs(g.old) > tol) { xout <- xout - g.old/gprimfcn(xout) ctr <- ctr + 1 g.old <- gfcn(xout) } list(xout=xout, niter=ctr, logLik=lLikHW15(xout), Hess=gprimfcn(xout)) } > tmpo <- NRaph2(3.7275, function(b) { pv <- .1*log(1+b*(1:4)) sum((0.1*(1:4)/(1+b*(1:4)))*(mvec-nvec*pv)/(pv*(1-pv)))}, function(b) { pv <- .1*log(1+b*(1:4)) -0.1* sum(((1:4)/(1+b*(1:4)))^2*( (mvec-nvec*pv)/ (pv*(1-pv))+0.1*(mvec/pv^2+(nvec-mvec)/(1-pv)^2)))}) $xout: [1] 4.820771 $niter: [1] 4 $logLik: [1] -193.9393 $Hess: [1] -0.6785414 > xt <- seq(2,7,by=.05) llk <- numeric(101) for(i in 1:101) llk[i] <- lLikHW15(xt[i]) plot(xt,llk) abline(h=tmpo$logLik - 0.5*qnorm(.975)^2, lty=3) ### This line shows that all values b within (2.9,8) ### would lie within the "confidence interval" ## Another way to view this is through the Wald CI > tmpo$xout - c(-1,1)*qnorm(.975) / tmpo$Hess [1] 1.932275 7.709266 ### We can see that this "interval" is skewed wrt ### LRT test-based region because the logLik function ### is asymmetric near the max !! ### Chi-square goodness of fit: > pvopt <- .1*log(1+tmpo$xout*(1:4)) sum((mvec-nvec*pvopt)^2/(nvec*pvopt*(1-pvopt))) [1] 0.6289064 ### compare with chisq 3df (16) SIMULATION of data: ## Start with correlated normal observations. > xvec <- rnorm(1000)*0.4 dmat <- cbind(rep(1,1000), xvec, -3*xvec+rnorm(1000)*0.7) > apply(dmat,2,summary) xvec Min. 1 -1.156850840 -4.83032127 1st Qu. 1 -0.259528092 -0.97472353 Median 1 0.022094641 -0.09772142 Mean 1 0.007300602 -0.05061261 3rd Qu. 1 0.271476445 0.84269223 Max. 1 1.194744110 4.82130422 > b1vec <- c(-1.3, .5, .25) scors <- c(dmat %*% b1vec) summary(exp(scors)) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.1480496 0.2360536 0.2704901 0.2757545 0.3074749 0.5199690 > ydat <- rbinom(1000,1,exp(scors)) > outmat <- cbind(round(dmat,6), ydat) write(t(outmat),file="/home1/evs/public_html/s798c/Data/Dset3D.asc", ncolumns=4) Analysis of "outmat" imported from Data/Dset3D.asc --------------------------------------------------- ### True beta is c(-1.3, .5, .25) > dfram <- data.frame(outmat[,2:4]) names(dfram) <- c("x1","x2","y") > glmtmp <- glm(cbind(y,1-y) ~ . , family=binomial, data=dfram) > glmtmp .. Coefficients: (Intercept) x1 x2 -0.967169 0.3817471 0.307846 Degrees of Freedom: 1000 Total; 997 Residual Residual Deviance: 1161.625 > c(lLikC(b1vec), lLikC(glmtmp$coef)) [1] -593.2391 -580.8126 ### NOTE that the logLik difference is much larger than would be ### expected (one half of a chi-square 3df rv) if the model ### were properly specified !!! 
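The function lLikC called just above (and again below) is never defined in this excerpt. A minimal reconstruction consistent with how it is used, namely the Bernoulli log-likelihood of the logistic model, assuming the dmat and dfram objects created above, is:

> lLikC <- function(b) {
     scors <- c(dmat %*% b)                      ### linear predictor X %*% b
     sum(dfram$y*scors - log(1 + exp(scors)))    ### Bernoulli loglik with P(y=1) = plogis(scors)
  }
### Sanity check: -2*lLikC(glmtmp$coef) = 1161.6 matches the residual deviance reported above.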
> tmpmax <- nlmin(function(b) -lLikC(b), rep(0,3)) > tmpmax $x: [1] -0.9671691 0.3817473 0.3078461 $converged: [1] T $conv.type: [1] "relative function convergence" ============================================================== ### Goodness of fit CI's > rbind(b1vec, fitted=glmtmp$coef, s.dev=summary(glmtmp)$coef[,2]) (Intercept) x1 x2 b1vec -1.30000000 0.5000000 0.25000000 fitted -0.96716901 0.3817471 0.3078460 s.dev 0.07163044 0.3490234 0.1012989 > rbind(ulim = glmtmp$coef + 1.96*summary(glmtmp)$coef[,2], llim = glmtmp$coef - 1.96*summary(glmtmp)$coef[,2]) (Intercept) x1 x2 ulim -0.8267733 1.0658331 0.5063918 llim -1.1075647 -0.3023388 0.1093002 These give CI's based on "properly specified" Iobs-based analysis. The comparison below with Jobs shows that this is reasonable. Other, test-based, CI's: for this we need to re-calculate log-likelihoods with maximized values substituted for all coordinates other than the one of interest. ### Begin with intercept: > likAlt1 <- function(b0) { auxmin <- nlminb(glmtmp$coef[2:3], function(b2dim,B0) -lLikC(c(B0,b2dim)), B0=b0) c(rMLE = auxmin$param, lLik= -auxmin$obj) } > likAlt1(glmtmp$coef[1]) rMLE.x1 rMLE.x2 lLik 0.3817471 0.307846 -580.8126 > likAlt1(-.7) rMLE.x1 rMLE.x2 lLik 0.375994 0.2926115 -588.0464 > likAlt1(-1.2) rMLE.x1 rMLE.x2 lLik 0.3964614 0.3298038 -585.8997 > xt <- seq(-1.3,-.7, length=100) yt <- numeric(100) for(i in 1:100) yt[i] <- likAlt1(xt[i])[3] plot(xt, yt) abline(h=-580.813-1.92) ## -582.733 threshold ### graphically read off CI for intercept: (-1.11, -0.83) > uniroot(function(b) likAlt1(b)[3]+582.733, c(-1.2,glmtmp$coef[1]))$root [1] -1.10913 > uniroot(function(b) likAlt1(b)[3]-(-580.813-1.92), c(glmtmp$coef[1],-0.7))$root [1] -0.8282462 ### This reproduces the Wald interval above for Intercept *** Now for x1 interval by test-based method: > likAlt2 <- function(b1) { auxmin <- nlminb(glmtmp$coef[c(1,3)], function(b2dim,B1) -lLikC(c(b2dim[1],B1,b2dim[2])), B1=b1) c(rMLE = auxmin$param, lLik= -auxmin$obj) } > c(uniroot(function(b) likAlt2(b)[3]+582.733, c(-.6,glmtmp$coef[2]))$root, uniroot(function(b) likAlt2(b)[3]+582.733, c(glmtmp$coef[2],1.5))$root) [1] -0.3019839 1.0673748 ### OK, very close !! *** Now for x2 interval by test-based method: > likAlt3 <- function(b2) { auxmin <- nlminb(glmtmp$coef[1:2], function(b2dim,B2) -lLikC(c(b2dim[1:2],B2)), B2=b2) c(rMLE = auxmin$param, lLik= -auxmin$obj) } > c(uniroot(function(b) likAlt3(b)[3]+582.733, c(-.2,glmtmp$coef[3]))$root, uniroot(function(b) likAlt3(b)[3]+582.733, c(glmtmp$coef[3],.8))$root) [1] 0.1102483 0.5076699 ### OK, close again !! (I) Comparison of 2 ways to calculate Iobs ### The first is the one inverted above: > Iobs1 <- t(dmat) %*% (dlogis(c(dmat %*% glmtmp$coef))*dmat) > round(Iobs1,4) xvec 196.5368 -5.8305 24.7926 xvec -5.8305 31.7733 -94.4104 24.7926 -94.4104 378.2660 > Iobs2 <- t(dmat) %*% ((dfram[,3] - plogis(c(dmat %*% glmtmp$coef)))^2*dmat) round(Iobs2,4) xvec 196.4871 -6.4059 25.1193 xvec -6.4059 31.2392 -93.8838 25.1193 -93.8838 382.9509 (II) Goodness of fit statistics: > sum( (dfram[,3] - plogis(c(dmat %*% glmtmp$coef)))^2 )/ sum( dlogis(c(dmat %*% glmtmp$coef)) ) [1] 0.9997468 ### Ratio of variances SHOULD be 1 ! > sum( (dfram[,3] - plogis(c(dmat %*% glmtmp$coef)))^2 / dlogis(c(dmat %*% glmtmp$coef)) ) [1] 1000.947 ### Fine for chi-squared 300 df! 
> mtabl <- table(dfram[,1]>0, dfram[,2] > 0, dfram[,3]) mtabl ### These are the observed counts , , 0 FALSE TRUE FALSE 61 267 TRUE 349 46 , , 1 FALSE TRUE FALSE 18 135 TRUE 100 24 > etabl <- aggregate.data.frame(plogis(c(dmat %*% glmtmp$coef)), by= list(dfram[,1]>0, dfram[,2] > 0), sum) > etabl Group.1 Group.2 x 1 FALSE FALSE 19.26567 2 TRUE FALSE 103.74966 3 FALSE TRUE 131.91261 4 TRUE TRUE 22.07206 > sum((etabl[,3]-mtabl[,,2])^2/etabl[,3]) [1] 0.4593281 ### Very small chi-square value !! (17) Log to Simulate and Begin Analysis of Thoroughly Misspecified Linear Model Dataset Dset17.asc for HW17. ===================================================== The model: y_i ~ N(a + Xmat %*% b), c^2) is an ordinary linear regression model with scalar intercept a, error-variance c^2, and 3-dim regression coefficients b, altogether giving 5-dim unknown parameter. > Xmat <- cbind(rnorm(350)*0.4+0.5, rnorm(350)*(-1.2), rexp(350)*0.6) > dimnames(Xmat)[[2]] <- c("X1","X2","X3") > Yvec <- -1.6 + Xmat[,1]^2 - 0.5*Xmat[,2] + sqrt(Xmat[,3]) + rnorm(350)*.4 > Lfram <- data.frame(cbind(Xmat[,c(1,3)], Y=Yvec)) > write.table(Lfram,"/home1/evs/public_html/s798c/Data/Dset17.asc", sep=" ") ## NB transformed two variables and omitted the third ! > lm1 <- lm(formula= Y ~ . , data=Lfram) Call: lm(formula = Y ~ ., data = Lfram) Coefficients: (Intercept) X1 X3 -1.387107 1.06529 0.5260654 *** So for part (i), get this for the b0,b1,b2 coeff's and *** the estimated variance parameter is > summary(lm1)$sigma^2 [1] 0.581183 *** For part (ii), we re-do the estimation using nlminb: > lLikNR <- function(betsigsq) -(nrow(Lfram)/2)*log(betsigsq[4]) - (0.5/betsigsq[4])* sum((-betsigsq[1] + c(data.matrix(Lfram) %*% c(betsigsq[2:3],1)))^2) > nlminb(c(rep(0,3),1), function(x4) -lLikNR(x4), lower=c( rep(-Inf,3),1e-5), upper=rep(Inf,4))$param [1] -1.3871072 -1.0652906 -0.5260648 0.5762019 ## OK !! ## Recover cov.unscaled from summary(lm1)$sigma^2 and ## summary(lm1)$cov.unscaled > I0 <- solve(summary(lm1)$cov.unscaled) > round(I0,4) (Intercept) X1 X3 (Intercept) 350.0000 178.7370 231.0422 X1 178.7370 151.4908 124.0789 X3 231.0422 124.0789 293.3373 > Xm <- cbind(rep(1,350), data.matrix(Lfram[,1:2])) t(Xm) %*% Xm X1 X3 350.0000 178.7370 231.0422 X1 178.7370 151.4908 124.0789 X3 231.0422 124.0789 293.3373 > round(summary(lm1)$coef,3) Value Std. Error t value Pr(>|t|) (Intercept) -1.387 0.076 -18.353 0 X1 1.065 0.098 10.820 0 X3 0.526 0.064 8.171 0 > sqrt(diag(summary(lm1)$cov.unscaled))*summary(lm1)$sigma [1] 0.07558005 0.09846005 0.06438342 *** So for part (iii), we use these estimates and SE's *** to get Wald-type CI's: > rbind(lower = lm1$coef-1.96*summary(lm1)$coef[,2], upper = lm1$coef+1.96*summary(lm1)$coef[,2]) (Intercept) X1 X3 lower -1.535244 0.8723081 0.3998739 upper -1.238971 1.2582715 0.6522569 ### Now proceed to compare Iobs=Iobs1 vss Jobs=Iobs2 > Iobs1 <- t(Xm) %*% Xm /summary(lm1)$sigma^2 ### Full Iobs1 matrix also has sigma^2 row and column: > Iobs1 <- cbind(rbind(Iobs1,rep(0,3)), c(0,0,0, 175/summary(lm1)$sigma^4)) Iobs1 X1 X3 602.2199 307.5399 397.5377 0.0000 X1 307.5399 260.6594 213.4936 0.0000 X3 397.5377 213.4936 504.7244 0.0000 0.0000 0.0000 0.0000 518.0983 > tm <- c(Lfram$Y - Xm %*% lm1$coef) Iobs2 <- t((tm^2*Xm)) %*% Xm / summary(lm1)$sigma^4 > Iobs2 X1 X3 597.0580 274.7184 344.1382 X1 274.7184 278.5695 184.6571 X3 344.1382 184.6571 407.6772 ### This is like our Jobs in class, but so far just the ### upper-left 3x3 block. 
> baux <- c(t(tm*Xm) %*% (tm^2/summary(lm1)$sigma^2-1)/ (2*summary(lm1)$sigma^4)) > Iobs2 <- cbind(rbind(Iobs2,baux), c(baux, sum((tm^2/ summary(lm1)$sigma^2-1)^2/(4*summary(lm1)$sigma^4)))) > Iobs2 X1 X3 597.0580 274.71839 344.13823 125.75393 X1 274.7184 278.56954 184.65713 31.72295 X3 344.1382 184.65713 407.67716 32.08578 baux 125.7539 31.72295 32.08578 524.76716 ## Recall the raw standard errors of coefficients: > round(sqrt(diag(summary(lm1)$cov.unscaled))* summary(lm1)$sigma,5) [1] 0.07558 0.09846 0.06438 > sqrt(diag(solve(Iobs1))) [1] 0.07558005 0.09846005 0.06438342 0.04393331 ### Now we have the "robust sandwich" version: > RS.SE <- round(sqrt(diag(solve(Iobs1) %*% Iobs2 %*% solve(Iobs1))),5) [1] 0.08905 0.11936 0.05996 0.04422 ### So the "robust" calculation makes a noticeable difference, although not a large one, even in the asymptotic standard error for sigma^2. *** Part (iv) Robust CI's for each of Intercept, X1, X2: > rbind(lower=lm1$coef-1.96*RS.SE[1:3], upper=lm1$coef+1.96*RS.SE[1:3]) (Intercept) X1 X3 lower -1.561645 0.8313442 0.4085438 upper -1.212569 1.2992354 0.6435870 (18) (i) Example as in class. begin by maximizing logLik directly. > Tvec <- scan(file="Dset3B.dat") ## after saving data into file > lLikmix <- function(plam) sum(log((1-plam[1])*plam[2]*exp(-(plam[2]-2)*Tvec)+2*plam[1])) > nlminb(c(.5,2), function(pl) -lLikmix(pl), lower=c(1e-6,1e-5), upper=c(1-1e-6,1e2))[1:4] $parameters: [1] 0.3998813 0.3663359 $objective: [1] -2133.742 $message: [1] "RELATIVE FUNCTION CONVERGENCE" $grad.norm: [1] 0.0129577 ### took 18 iterations ... > plam <- .Last.value[[1]] ## Let's check this, roughly. The mixture density should ## have mean p/2 + (1-p)/lambda > c(plam[1]/2 + (1-plam[1])/plam[2], mean(Tvec)) [1] 1.838106 1.839836 #### OK !! (ii) EM algorithm: (poor starting point .5, 2 as above) > plamEM <- function(plam0) { probs <- 1/(1+(1/plam0[1]-1)*0.5*plam0[2]*exp( -(plam0[2]-2)*Tvec)) mp <- mean(probs) c(mp,(1-mp)/mean(Tvec*(1-probs))) } Next step: recursively calculate, and print results: > plam0 <- c(.5,2) for(i in 1:100) { plam0 <- plamEM(plam0) paux <- round(plam0,5) cat("iter",i,",p=", paux[1],",lam=",paux[2]," ") if(i %% 2 == 0) cat("\n") } ... iter 41 ,p= 0.39988 ,lam= 0.36634 iter 42 ,p= 0.39988 ,lam= 0.36634 ### Reach this value at iteration 41 and didn't change ... > lLikmix(c(.5,2)) [1] 693.1472 > lLikmix(c(0.39988, 0.36634)) [1] 2133.742 ### Let's confirm that the logLik never decreased: > plamLik <- numeric(101) plam0 <- c(.5,2) plamLik[1] <- lLikmix(plam0) for(i in 2:101) { plam0 <- plamEM(plam0) plamLik[i] <- lLikmix(plam0) } sum(diff(plamLik) < 0) > min(diff(plamLik)) [1] -5.911716e-12 (19) Of course we could simulate a multinomial(100,(a,(1+2a)/6, (1-a)/3, 1/2-a)) for a=.1, by: > table(sample(1:4,100,replace=T, prob=c(.1,.2,.3,.4))) 1 2 3 4 5 17 27 51 ## But that is not at all parallelizable, so to do it 10000 ## times would either involve doing a 10000x100 array or ## using a for-loop ! Instead, we simulate n1 ~ Binom(100,.1), n2 ~ Binom(100-n1,.2/.9), n3 ~ Binom(100-n1-n2, .3/.7), and n4=100-n1-n2-n3. nArr <- array(0, dim=c(1e4,4)) nArr[,1] <- rbinom(1e4,100,.1) nArr[,2] <- rbinom(1e4,100-nArr[,1],2/9) nArr[,3] <- rbinom(1e4,100-nArr[,1]-nArr[,2],3/7) nArr[,4] <- 100 - nArr[,1]-nArr[,2]-nArr[,3] T1vec <- (nArr[,1]-rep(10,1e4))^2/10 + (nArr[,2]-rep(20,1e4))^2/20 + (nArr[,3]-rep(30,1e4))^2/30 + (nArr[,4]-rep(40,1e4))^2/40 > summary(T1vec) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00000 1.20833 2.38333 2.99121 4.12500 19.45833 > summary(pchisq(T1vec,3)) Min. 
1st Qu. Median Mean 3rd Qu. Max. 0.0000000 0.2489937 0.5032553 0.4991340 0.7517211 0.9997802 ### If chi-square with 3 df were correct distribution, these ### values would be: 0, .25, .5, .5, .75, 1 > rowlik <- function(nmat, avec) (nmat * log(cbind(avec,(1+2*avec)/6, (1-avec)/3, 0.5-avec))) %*% rep(1,4) rowlikpr <- function(nmat,avec) (nmat/cbind(avec,(1+2*avec)/6, (1-avec)/3, 0.5-avec)) %*% c(1,1/3,-1/3,-1) rowlikdpr <- function(nmat,avec) (nmat/cbind(avec,(1+2*avec)/6, (1-avec)/3, 0.5-avec)^2) %*% c(-1,-1/9,-1/9,-1) > aold <- rep(0,1e4) anew <- rep(.1,1e4) ctr <- 0 iset <- 1:10000 while( ctr < 20 & length(iset)) { aold[iset] <- anew[iset] anew[iset] <- anew[iset] - rowlikpr(nArr[iset,],aold[iset])/ rowlikdpr(nArr[iset,],aold[iset]) iset <- iset[abs(anew[iset]-aold[iset]) > 1.e-5] ctr <- ctr+1 } summary(rowlikpr(nArr,anew)) Min. 1st Qu. Median Mean 3rd Qu. Max. -0.01989776 0.0000 0.0000 -0.00000189 0.00000003 0.00000428 > sum(abs(rowlikpr(nArr,anew)) > 1.e-5) [1] 1 ### Actually the one remaining case, #1230 was FAR from converged !! > anew[abs(rowlikpr(nArr,anew)) > 1.e-5] [1] -5025.066 > nArr[1230,] [1] 4 6 41 49 > optimize(function(a) -rowlik(nArr[1230,],a), c(1e-6,.49))$min [1] 0.02962868 > anew[1230] <- 0.02962868 ### This is a fairly extreme value !! ### There were also 102 cases where a negative anew value was chosen! > for(i in (1:10000)[anew < 0]) anew[i] <- optimize(function(a) -rowlik(nArr[i,],a), c(1e-6,.49))$min > summary(anew) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0132118 0.0814582 0.1000000 0.1003715 0.1183398 0.2111745 > T2vec <- (nArr[,1]-100*anew)^2/(100*anew) + (nArr[,2]-100*(1/6+anew/3))^2/(100*(1/6+anew/3)) + (nArr[,3]-(100/3)*(1-anew))^2/((100/3)*(1-anew)) + (nArr[,4]-100*(0.5-anew))^2/(100*(0.5-anew)) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000000 0.2612118 0.5057364 0.4993999 0.7444405 0.9998610 ### Again this is close to the desired values for Uniform[0,1] ### We finish off by testing the difference between empirical ### distribution function values for T1vec and T2vec and ### pchisq(.,3) and pchisq(.,3), at the points 1:10. > T1df <- numeric(10) -> T2df > for(i in 1:10) { T1df[i] <- sum(T1vec <= i)/1e4 T2df[i] <- sum(T2vec <= i)/1e4 } > cbind(T1val=T1df, T1CIwidth=round(1.96*sqrt(T1df*(1-T1df))/100,5), T2val=T2df, T2CIwidth=round(1.96*sqrt(T2df*(1-T2df))/100,5), Chi2 = round(pchisq(1:10,2),4), Chi3 = round(pchisq(1:10,3),4)) T1val T1CIwidth T2val T2CIwidth Chi2 Chi3 [1,] 0.1886 0.00767 0.3835 0.00953 0.3935 0.1987 [2,] 0.4262 0.00969 0.6345 0.00944 0.6321 0.4276 [3,] 0.6100 0.00956 0.7765 0.00817 0.7769 0.6084 [4,] 0.7358 0.00864 0.8650 0.00670 0.8647 0.7385 [5,] 0.8273 0.00741 0.9219 0.00526 0.9179 0.8282 [6,] 0.8897 0.00614 0.9542 0.00410 0.9502 0.8884 [7,] 0.9273 0.00509 0.9739 0.00312 0.9698 0.9281 [8,] 0.9546 0.00408 0.9835 0.00250 0.9817 0.9540 [9,] 0.9710 0.00329 0.9894 0.00201 0.9889 0.9707 [10,] 0.9814 0.00265 0.9932 0.00161 0.9933 0.9814 ## Very interesting results! Shows clearly that T1 is at least very close to Chi-square 3df distribution, while T2 is at least very close to Chi-square 2df distribution. There MAY be discrepancy in each case at distribution argument 1 (but nowehere else), and even there the most likely explanation may be some fault of the likelihood maximizations which resulted in the smallest values of the paramete a . 
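Side note (added, not in the original log): in R the multinomial counts can also be generated directly and fully vectorized with rmultinom, which avoids the sequential-binomial construction; that construction is still instructive and was the natural route in older S-Plus.

> nArrAlt <- t(rmultinom(1e4, size = 100, prob = c(.1, .2, .3, .4)))   ### 1e4 x 4 matrix of counts
> apply(nArrAlt, 2, mean)    ### column means should be close to 10, 20, 30, 40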
(20) /* Saved SASsales.asc data as HW20dat, ftp'd to cluster */ libname home "."; data home.HW20; infile"HW20dat"; input Mth Qtr Rep $ Type $ Units Price; Amount = Units*Price; run; NOTE: The data set HOME.HW20 has 110 observations and 7 variables. proc sort data=home.HW20 ; /* could omit the data = option */ by QTR REP; proc means data=home.HW20 N Mean; var amount ; by QTR Rep; run; ------ Qtr=1 Rep=Hollings ----- The MEANS Procedure Analysis Variable : Amount N Mean ----------------- 5 7558.90 ------ Qtr=1 Rep=Jones -------- N Mean ----------------- 5 6451.83 ------ Qtr=1 Rep=Smith -------- N Mean ----------------- 8 9286.86 ------ Qtr=2 Rep=Hollings ----- N Mean ----------------- 6 10314.04 ------ Qtr=2 Rep=Jones ------- N Mean ----------------- 6 8023.23 ------ Qtr=2 Rep=Smith ------- N Mean ----------------- 6 11687.38 ------ Qtr=3 Rep=Hollings ---- N Mean ----------------- 15 4642.82 ------- Qtr=3 Rep=Jones ------ N Mean ----------------- 21 6725.55 ------ Qtr=3 Rep=Smith ------- N Mean ----------------- 21 6886.85 ----- Qtr=4 Rep=Hollings ----- N Mean ----------------- 6 8667.08 ----- Qtr=4 Rep=Jones -------- N Mean ----------------- 6 12245.98 ------ Qtr=4 Rep=Smith ------- N Mean ----------------- 5 7712.67 /* Or alternatively: */ proc tabulate data=home.hw20; class Qtr Rep ; var amount; table Qtr*Rep, amount*mean amount*n; run; The SAS System 11:09 Friday, April 2, 2004 1 ----------------------------------------------------- | | Amount | Amount | | |------------+------------| | | Mean | N | |-------------------------+------------+------------| |Qtr |Rep | | | |------------+------------| | | |1 |Hollings | 7558.90| 5.00| | |------------+------------+------------| | |Jones | 6451.83| 5.00| | |------------+------------+------------| | |Smith | 9286.86| 8.00| |------------+------------+------------+------------| |2 |Hollings | 10314.04| 6.00| | |------------+------------+------------| | |Jones | 8023.23| 6.00| | |------------+------------+------------| | |Smith | 11687.38| 6.00| |------------+------------+------------+------------| |3 |Hollings | 4642.82| 15.00| | |------------+------------+------------| | |Jones | 6725.55| 21.00| | |------------+------------+------------| | |Smith | 6886.85| 21.00| |------------+------------+------------+------------| |4 |Hollings | 8667.08| 6.00| | |------------+------------+------------| | |Jones | 12245.98| 6.00| | |------------+------------+------------| | |Smith | 7712.67| 5.00| ----------------------------------------------------- proc sort data=home.hw20 out=tmp20; by Type Mth Rep; data tmpA (drop=ctr) tmpB (drop=ctr) ; set tmp20; if _N_ = 1 then ctr = 0; if Type EQ "Standard" then do; ctr+1; if ctr LT 16 then output tmpA; end; else output tmpB; proc print data=tmpA; title 'Standard Units Sold, by Mth & Salesrep: 15 cases'; proc print data=tmpB; title 'Deluxe Units Sold, by Mth & Salesrep'; run; ... /* OK, worked ! */ (21) libname home "."; data home.PBC (drop=labels1-labels10); infile "pbcdata.txt"; if _N_=1 then input labels1-labels10 $ ; else input obs idnum dth evttime treatgp logbili agevar cirrh cchol albumin; if obs ne . ; run; .. "Invalid data" errors for the fileds in the blank line!! NOTE: The data set HOME.PBC has 216 observations and 10 variables. 
proc print; where obs < 6; run; Obs obs idnum dth evttime treatgp logbili agevar cirrh cchol albumin 1 1 180 0 0 1 1.00000 36.598 0 0 37 2 2 188 0 0 0 1.36173 29.964 0 0 35 3 3 56 0 0 1 0.77815 81.451 0 0 36 4 4 19 0 0 0 1.92428 121.510 1 0 41 5 5 190 0 0 1 1.00000 60.340 0 0 32 (ii) proc sort data=home.pbc out=home.altpbc; by descending treatgp; proc means data=home.altpbc mean N ; var agevar; by descending treatgp; run; /* this works, but a better output format is given by proc tabulate: */ proc tabulate data=home.altpbc; class treatgp; var agevar; table treatgp, agevar*mean, agevar*n ; run; proc sort data=home.pbc out=home.altpbc; by descending treatgp; proc means data=home.altpbc mean N ; var agevar; by treatgp; run; ----------------------------------------------------- | | agevar | agevar | | |------------+------------| | | Mean | N | |-------------------------+------------+------------| |treatgp | | | |-------------------------| | | |0 | 55.20| 112.00| |-------------------------+------------+------------| |1 | 51.07| 104.00| ----------------------------------------------------- data tmpavg (drop=agevar); set home.altpbc (keep=agevar treatgp); by descending treatgp; if first.treatgp then do; ctr=0; agemean=0; end; ctr+1; agemean+agevar; if last.treatgp then do; agemean=agemean/ctr; output; end; proc print; run; data tmpavg; set home.altpbc (keep=agevar treatgp); if first.treatgp then do; ctr=0; agemean=0; end; ctr+1; agemean+agevar; if last.treatgp then do; agemean=agemean/ctr; output; end; proc print; run; Obs treatgp ctr agemean 1 1 104 51.0668 2 0 112 55.1976O (iii) proc univariate data=home.pbc; var logbili albumin; proc means data=home.pbc; var logbili albumin; run; /* By default, "means" gives only: N mean stddev min max while "univariate" gives quantiles, extremes , t, kurtosis, you-nasme-it. But both have options (to give more or less output.) */ (iv) proc sort data=home.pbc out=alt2pbc; by dth treatgp cirrh cchol; proc means data=alt2pbc min max; var logbili; by dth treatgp cirrh cchol; run; /* Lots of output in 16 separate segments! Here is the alternative method. */ data tmpminmax (drop= logbili); set alt2pbc (keep= dth treatgp cirrh cchol logbili); by dth treatgp cirrh cchol; retain logbili; if first.cchol then do; minlbil=1000; maxlbil=-1000; end; minlbil=min(minlbil, logbili); maxlbil=max(maxlbil, logbili); if last.cchol then output; run; proc print; run; Obs dth treatgp cirrh cchol minlbil maxlbil 1 0 0 0 0 0.84510 2.07188 2 0 0 0 1 0.84510 2.17319 3 0 0 1 0 1.11394 1.92428 4 0 1 0 0 0.69897 1.96379 5 0 1 0 1 1.74036 2.26951 6 0 1 1 0 1.20412 1.98677 7 1 0 0 0 1.00000 2.46538 8 1 0 0 1 1.47712 2.72263 9 1 0 1 0 1.20412 2.50786 10 1 0 1 1 1.44716 2.46835 11 1 1 0 0 0.77815 2.34635 12 1 1 0 1 1.25527 2.56110 13 1 1 1 0 1.32222 2.03342 14 1 1 1 1 1.71600 2.62634 /* NOTE that not all cases occur !!! */ (22) Splus code to reproduce calculations in BassSAS.txt. ## Getting the data > Bass <- read.table(file="Bass.Dat", skip=35, header=T) > names(Bass) [1] "ID" "Alkalinity" "pH" [4] "Calcium" "Chlorophyll" "AvgMercury" [7] "No.samples" "min" "max" [10] "X3yrStandardMercury" "agedata" ## interesting that the new version of Splus automatically takes the ## name of the lake as the row label > names(Bass) <- c("Id","Alk","pH","Calc","Chlor", "AvMrc","nsamp","min","max","Mrc3yr","agedat") > Bass <- cbind.data.frame(logAvM = log(Bass$AvMrc), Bass) ## Plotting would be accomplished by : > plot(Bass$Alk, Bass$AvMrc) ## etc. 
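(Added aside: a scatterplot matrix gives all of these exploratory plots in one call.)

> pairs(Bass[, c("logAvM","Alk","pH","Calc","Chlor")])   ### response vs. all candidate predictors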
> lm1 <- lm(logAvM ~ Alk, data=Bass) NOTE THAT SAS IN CREATING AN OUTPUT FILE FROM PROC REG ALSO INCLUDES ALL OF THE ORIGINAL DATASET'S COLUMNS UNLESS YOU SUPPRESS THEM !! ## predictors: > lm1$fitted[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry -0.4137453 -0.3760587 -2.142617 -0.9397872 -0.3603559 -0.6288729 -0.4027533 ### all OK; and so are the residuals > lm1$resid[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry 0.6207594 0.6612376 -1.076258 0.1188066 0.5426775 -0.6804605 -0.3312158 > Xm <- model.matrix(lm1) > summary(lm1)$cov.un (Intercept) Alk (Intercept) 0.0374267821 -4.945048e-04 Alk -0.0004945048 1.317619e-05 > solve(t(Xm) %*% Xm) (Intercept) Alk (Intercept) 0.0374267821 -4.945048e-04 Alk -0.0004945048 1.317619e-05 > Hat <- Xm %*% summary(lm1)$cov.un %*% t(Xm) ## 53x53 matrix ## Now studentize residual by using standard errors sigma*sqrt(1-h_{ii}) > StudRsd <- (lm1$resid/sqrt(1-diag(Hat)))/summary(lm1)$sigma StudRsd[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry 1.064066 1.134668 -1.913229 0.2022827 0.931661 -1.161048 -0.5679219 ## Now Cook's Distance: > CookD <- (StudRsd)^2*(0.5*diag(Hat)/(1-diag(Hat))) CookD[1:7] Alligator Annie Apopka BlueCypress Brick Bryant Cherry 0.01874503 0.02274477 0.2033591 0.0003944242 0.01575785 0.01594074 0.00544142 ## Finally the confidence limits for mean (CLM) and individual obs (CL) > SE1 <- summary(lm1)$sigma*sqrt(diag(Hat)) SE2 <- summary(lm1)$sigma*sqrt(1-diag(Hat)) rbind(LCLM = lm1$fit[1:6] - qt(.975,51)*SE1[1:6], UCLM = lm1$fit[1:6] + qt(.975,51)*SE1[1:6], LCL = lm1$fit[1:6] - qt(.975,51)*SE2[1:6], UCL = lm1$fit[1:6] + qt(.975,51)*SE2[1:6]) Alligator Annie Apopka BlueCypress Brick Bryant LCLM -0.6268623 -0.5959707 -2.519064 -1.1035042 -0.5831806 -0.8098173 UCLM -0.2006282 -0.1561466 -1.766171 -0.7760702 -0.1375313 -0.4479284 LCL -1.5849385 -1.5459950 -3.271954 -2.1189009 -1.5297410 -1.8054660 UCL 0.7574479 0.7938777 -1.013281 0.2393265 0.8090292 0.5477203 > anova(lm1) ### Same info as in main SAS table Analysis of Variance Table Df Sum of Sq Mean Sq F Value Pr(F) Alk 1 18.71377 18.71377 53.22362 1.859181e-09 Residuals 51 17.93193 0.35161 > summary(lm1)$coef Value Std. Error t value Pr(>|t|) (Intercept) -0.32109907 0.114714868 -2.799106 7.216728e-03 Alk -0.01570274 0.002152402 -7.295452 1.859181e-09 Parameter Standard Variable DF Estimate Error t Value Pr > |t| Intercept 1 -0.32110 0.11471 -2.80 0.0072 Alk 1 -0.01570 0.00215 -7.30 <.0001 ### Next a plot (omitted) includng fitted line, actual points and CL's and CLM's. 
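Added cross-check, not in the original log: current R provides these regression diagnostics as built-ins, which is a convenient way to verify the hand-computed Hat, StudRsd and CookD above. The differences below should all be zero up to rounding.

> range(hatvalues(lm1) - diag(Hat))        ### leverages h_ii
> range(rstandard(lm1) - StudRsd)          ### internally studentized residuals
> range(cooks.distance(lm1) - CookD)       ### Cook's distance (p = 2 coefficients here)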
## Next comes the summary statistics for CookD, logMrsd, Studnt : > round(c(quantile(CookD,prob=c(0,.05,.25,.5,.75,.95,1)), sqrt(var(CookD))),4) 0% 5% 25% 50% 75% 95% 100% 0 1e-04 0.0018 0.005 0.0178 0.1185 0.2637 0.0491 > round(c(quantile(lm1$resid,prob=c(0,.05,.25,.5,.75,.95,1)), sqrt(var(CookD))),4) 0% 5% 25% 50% 75% 95% 100% -2.0655 -0.8741 -0.2795 0.0822 0.2923 0.7118 1.792 0.0491 > round(c(quantile(StudRsd,prob=c(0,.05,.25,.5,.75,.95,1)), sqrt(var(StudRsd))),4) 0% 5% 25% 50% 75% 95% 100% -3.5224 -1.4911 -0.4774 0.1409 0.5029 1.2174 3.1037 1.011 > inds <- union((1:53)[CookD > 0.140 | abs(lm1$resid) > 1 | abs(StudRsd) > 1.5], c(36,52)) > inds [1] 3 18 36 38 40 52 > cbind(data.matrix(Bass[inds,1:3]), logMrsd=lm1$resid[inds], logMprd= lm1$fit[inds], CookD=CookD[inds], Studnt = StudRsd[inds]) logAvM Id Alk logMrsd logMprd CookD Studnt Apopka -3.21887582 3 116.0 -1.0762584 -2.1426174 0.20335906 -1.913229 Harney -0.26136476 18 61.3 1.0223125 -1.2836773 0.04124808 1.747211 Orange -1.71479843 36 25.4 -0.9948497 -0.7199488 0.03054176 -1.695488 Parker -3.21887582 38 53.0 -2.0655313 -1.1533445 0.13968778 -3.522399 Puzzle 0.09531018 40 87.6 1.7919696 -1.6966595 0.26365592 3.103668 Wildcat -1.38629436 52 17.3 -0.7935378 -0.5927565 0.02281806 -1.354791 > summary(lm1)$r.sq ### = 0.5106676 lm2 <- update(lm1, data=Bass[setdiff(1:53,inds),]) > summary(lm2)$r.sq [1] 0.7197122 > rbind(lm1$coef,lm2$coef) (Intercept) Alk [1,] -0.3210991 -0.01570274 [2,] -0.2656866 -0.01600271 ## Now produce Low and Hi predictors at "outlying" points, from model defined on 47 non-`outlying' points: > c(model.matrix(lm1)[inds,] %*% lm2$coef) + outer(sqrt(1 + diag(Xm[inds,] %*% summary(lm2)$cov.un %*% t(Xm[inds,]))), c(-1,1))* summary(lm2)$sigma*qt(.975,45) [,1] [,2] [1,] -2.934843 -1.3091579 [2,] -2.026140 -0.4671655 [3,] -1.448031 0.1037205 [4,] -1.891160 -0.3365000 [5,] -2.458992 -0.8760555 [6,] -1.319657 0.2345897 #### Not exactly the values ### previously produced, but close, and reasonable ... > cbind(data.matrix(Bass[inds,1:3]), fit1=lm1$fit[inds], + fit2=c(model.matrix(lm1)[inds,] %*% lm2$coef)) logAvM Id Alk fit1 fit2 Apopka -3.21887582 3 116.0 -2.1426174 -2.1220007 Harney -0.26136476 18 61.3 -1.2836773 -1.2466526 Orange -1.71479843 36 25.4 -0.7199488 -0.6721554 Parker -3.21887582 38 53.0 -1.1533445 -1.1138301 Puzzle 0.09531018 40 87.6 -1.6966595 -1.6675238 Wildcat -1.38629436 52 17.3 -0.5927565 -0.5425335 Further illustration related to Stepwise model-selection ! 
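The SAS forward-selection run follows. As an aside not in the original log, the analogous forward search could be done on the S-Plus/R side with stepAIC from the MASS library; it uses an AIC criterion rather than SAS's C(p) and F-to-enter rules, so the selected sequence need not agree step for step.

> library(MASS)
> stepAIC(lm(logAvM ~ 1, data = Bass), scope = ~ Alk + pH + Calc + Chlor,
          direction = "forward")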
proc reg data=bass2;
   model logAvM = Alk pH Calc Chlor / selection = forward best=2;
run;

                      Summary of Forward Selection

        Variable   Number   Partial    Model
 Step   Entered    Vars In  R-Square  R-Square      C(p)    F-Val     Pr>F
   1    Alk           1      0.5107    0.5107    29.7014    53.22   <.0001
   2    Chlor         2      0.1734    0.6841     3.8085    27.45   <.0001
   3    Calc          3      0.0154    0.6995     3.3266     2.52   0.1191

(25)

> hist(geyser$waiting, nclass=20, prob=T, xlab="Waiting Times",
       ylab="Scaled Relative Frequency", main=
       "Geyser Waiting Times, Histogram & Overplotted Density Estimates")
> gpts <- seq(min(geyser$waiting), max(geyser$waiting), length=50)
  kerMatg1 <- outer(gpts, geyser$waiting, function(x,y) dlogis(x,y,.5))
  kerMatg2 <- outer(gpts, geyser$waiting, function(x,y) dlogis(x,y,3))
  lines(gpts, c(kerMatg1 %*% rep(1/299,299)), lty=1)
  lines(gpts, c(kerMatg2 %*% rep(1/299,299)), lty=4)
  legend(locator(), legend=c("Bandwidth .5", "Bandwidth 3"), lty=c(1,4))

> lLk2 <- function(x, pwt, mn, scal, dens = dnorm) {
     dnsmat <- matrix(dens(outer(mn,x,"-")/scal)/scal, ncol=length(x))
     sum(log(pwt %*% dnsmat)) }

> tmpft <- nlmin(function(w) -lLk2(log(geyser$waiting), c(plogis(w[1]),
       1-plogis(w[1])), w[2:3], exp(w[4:5])), c(0.5,3.95,4.35,log(.2),log(.2)),
       print.level=1, max.fcal=300, max.iter=100)
> tmpft
$x:
[1] -0.7073344  4.0009645  4.3892868 -2.3041704 -2.4458907
$converged:
[1] T
$conv.type:
[1] "relative function convergence"

### Thus we have convergence to a mixture of two normals (for the log-waiting
### times) with proportions 0.3302, 0.6698 of N(4.0010, 0.0998^2) and
### N(4.3893, 0.0866^2).

### Now let's try a mixture of three normals:
> tmpft3 <- nlmin(function(w) -lLk2(log(geyser$waiting), c(plogis(w[1]),
       plogis(w[2])*(1-plogis(w[1])), (1-plogis(w[1]))*(1-plogis(w[2]))),
       w[3:5], exp(w[6:8])), c(0.5,0.5,3.95,4.15,4.35, rep(log(.2),3)),
       print.level=1, max.fcal=300, max.iter=100)
> tmpft3
$x:
[1] -1.2087436 -0.9080385  3.9655844  4.2238913  4.3998657 -2.5495674
[7] -1.7914989 -2.5778582
$converged:
[1] T
$conv.type:
[1] "relative function convergence"

### This gives normal components with probability weights
> c(plogis(tmpft3$x[1]), (1-plogis(tmpft3$x[1]))*plogis(tmpft3$x[2]),
    (1-plogis(tmpft3$x[1]))*(1-plogis(tmpft3$x[2])))
[1] 0.2299234 0.2213211 0.5487555
### and means and scale parameters respectively given by
> rbind(means=tmpft3$x[3:5], sdev=exp(tmpft3$x[6:8]))
            [,1]      [,2]       [,3]
means 3.96558439 4.2238913 4.39986566
sdev  0.07811545 0.1667101 0.07593647

> hist(geyser$waiting, nclass=20, prob=T, xlab="Waiting Times",
       ylab="Scaled Relative Frequency", main=
       "Geyser Waiting Times, Histogram & Overplotted Density Estimates")
### We overplot the waiting-time densities implied by the two- and
### three-component log-normal mixtures:
> fmix
function(y, pwt, mn, scal) {
    prb <- if(length(pwt) == 2)
         c(plogis(pwt[1]), (1 - plogis(pwt[1])) * plogis(pwt[2]),
           (1 - plogis(pwt[1])) * (1 - plogis(pwt[2])))
       else c(plogis(pwt[1]), (1 - plogis(pwt[1])))
    c(prb %*% matrix(dnorm(outer(mn,log(y),"-")/scal)/scal, ncol=length(y)))/y }
> lines(seq(43,108,.5), fmix(seq(43,108,.5), tmpft$x[1], tmpft$x[2:3],
       exp(tmpft$x[4:5])), lty=1)
> lines(seq(43,108,.5), fmix(seq(43,108,.5), tmpft3$x[1:2], tmpft3$x[3:5],
       exp(tmpft3$x[6:8])), lty=3)

### To compare the fit done both ways:
### logLik for the 2-component and for the 3-component models:
> sum(log(fmix(geyser$waiting, tmpft$x[1], tmpft$x[2:3], exp(tmpft$x[4:5]))))
[1] -1155.923
> sum(log(fmix(geyser$waiting, tmpft3$x[1:2], tmpft3$x[3:5], exp(tmpft3$x[6:8]))))
[1] -1154.563
### So the fit is only a tiny bit `better' with the third component --
### definitely not enough of an improvement over the 2-component model
### to be worth bothering with!
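### The nlmin() optimizer above is from S-Plus.  A minimal base-R sketch of the
### same two-component fit, using optim() with the lLk2 objective and starting
### values defined above (negll2 and fit2 are new names introduced here), might
### look like this; fit2$par should land near the $x vector reported by nlmin,
### up to optimizer tolerance:

  library(MASS)                      # for the geyser data
  negll2 <- function(w) -lLk2(log(geyser$waiting),
                  c(plogis(w[1]), 1-plogis(w[1])), w[2:3], exp(w[4:5]))
  fit2 <- optim(c(0.5, 3.95, 4.35, log(.2), log(.2)), negll2,
                method="BFGS", control=list(maxit=300))
  c(prop1 = plogis(fit2$par[1]),     # mixing proportion of the first component
    means = fit2$par[2:3],           # component means on the log scale
    sds   = exp(fit2$par[4:5]))      # component SDs on the log scale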
(26)

> library(MASS)
> rubberlm <- lm(loss ~ ., data=Rubber)
> rubberlm$coef
(Intercept)     hard      tens
   885.1611 -6.57083 -1.374312
### The simplest rule based on the linear regression is to predict each
### (hard,tens) observation as above-median if
###      885.1611 - 6.57083*hard - 1.374312*tens > 165.
### On the original data this yields
> table(rubberlm$fit > 165, Rubber$loss > 165)
        FALSE TRUE
  FALSE    14    1
  TRUE      1   14
### so only 2 are misclassified.

## The second method is logistic regression:
> Above <- as.numeric(Rubber$loss > 165)
  rubberlgs <- glm(cbind(Above, 1-Above) ~ hard+tens, data=Rubber,
                   family=binomial)
> rubberlgs$coef
(Intercept)       hard       tens
   57.24511 -0.5125454 -0.1120316
> table(rubberlgs$fit > 0.5, Rubber$loss > 165)
        FALSE TRUE
  FALSE    14    2
  TRUE      1   13
### now 3 are misclassified.

## Third method: predict based on whether the kernel NPR value is > 165.
> kerRubber <- outer(rubberlm$fit, rubberlm$fit, function(x,y) dlogis(x,y,40))
  ## 30x30 matrix:
  kerRubfit <- c(kerRubber %*% Rubber$loss)/c(kerRubber %*% rep(1,30))
> table(kerRubfit > 165, Rubber$loss > 165)
        FALSE TRUE
  FALSE    14    3
  TRUE      1   12
### now 4 are misclassified.

> RubbAlt <- cbind.data.frame(Rubber, binvec = as.numeric(Rubber$loss > 165))
> Prdct3 <- function(trainfram, testfram) {
     trainlm <- lm(loss ~ hard+tens, data=trainfram)
     testlmfit <- trainlm$coef[1] + c(data.matrix(testfram[,2:3]) %*%
                                      trainlm$coef[2:3])
     trainlgs <- glm(cbind(binvec, 1-binvec) ~ hard+tens, data=trainfram,
                     family=binomial)
     testlgsc <- trainlgs$coef[1] + c(data.matrix(testfram[,2:3]) %*%
                                      trainlgs$coef[2:3])
     kertrain <- outer(testlmfit, trainlm$fit, function(x,y) dlogis(x,y,40))
     kertest <- c(kertrain %*% trainfram$loss)/c(kertrain %*%
                                                 rep(1,nrow(trainfram)))
     list(lm.mis  = sum( (testlmfit > 165) != (testfram$loss > 165)),
          glm.mis = sum( (testlgsc > 0)    != (testfram$loss > 165)),
          ker.mis = sum( (kertest > 165)   != (testfram$loss > 165))) }
> unlist(Prdct3(RubbAlt, RubbAlt))
 lm.mis glm.mis ker.mis
      2       3       4
## Reproduces the previous calculation.
### This completes part (i).  NOTE that these estimates of misclassification
### rates out of 30 will turn out to be too low!

NOTE: it turns out in further testing with this function that -- as pointed
out to me by Sam Younkin -- there are several cases of cross-validation
training samples in which the logistic fit is "unstable": these are fits in
which the (not-quite-converged) models are trying to fit essentially 0-or-1
predictions, but where there are still some wrongly-predicted cases!

There were 50 warnings (use warnings() to see them)

> Misclass <- array(0, dim=c(1000,3),
                    dimnames=list(NULL, c("lm.mis","glm.mis","ker.mis")))
  for(i in 1:1000) {
     lvout <- sample(30,5)
     lvin  <- setdiff(1:30, lvout)
     Misclass[i,] <- unlist(Prdct4(RubbAlt[lvin,], RubbAlt[lvout,])) }
> apply(Misclass,2,sum)/5000
 lm.mis glm.mis ker.mis
 0.1148  0.1142  0.1264
### These are the estimated misclassification rates, suggesting that "linear"
### works best for these data and that roughly one in 8 points is
### misclassified.  (Actually, we would need a considerably larger simulation
### study to confirm this, and such a study is constrained by the fact that
### there are only {30 choose 5} = 142506 distinct leave-5-out subsets.)
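### For completeness, a sketch of how the leave-5-out loop above can be run
### with the Prdct3() defined earlier.  (Prdct4, used in the original log, is
### not shown there; presumably it is a variant of Prdct3 adjusted after the
### instability noted above.)  Misclass3 and the seed are illustrative names
### only; suppressWarnings() just keeps the glm() warnings off the console.

  set.seed(1)                                   # for reproducibility only
  Misclass3 <- array(0, dim=c(1000,3),
                     dimnames=list(NULL, c("lm.mis","glm.mis","ker.mis")))
  for (i in 1:1000) {
      lvout <- sample(30, 5)                    # 5 held-out rows
      lvin  <- setdiff(1:30, lvout)             # 25 training rows
      Misclass3[i,] <- suppressWarnings(
          unlist(Prdct3(RubbAlt[lvin,], RubbAlt[lvout,])))
  }
  apply(Misclass3, 2, sum)/5000                 # estimated misclassification rates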
*** Now for the bootstraps (part (iii)).  The basis for this is that we have
3 `statistics' (to be spelled out now) for estimation of
P(loss > 165 | hard, tens) based on a given data-set (a data frame with
columns loss, hard, tens).

The statistics are:
  (a) from normal linear regression:
          1 - Phi( (165 - b0 - b1*hard - b2*tens)/sigmahat )
  (b) from logistic regression:
          plogis( c0 + c1*hard + c2*tens )
  (c) from the kernel method (by analogy with the NPR approach):
          sum_i I[loss_i > 165] * ker((x - b0 - b1*hard_i - b2*tens_i)/40) /
                sum_i ker((x - b0 - b1*hard_i - b2*tens_i)/40)
These are the statistics we bootstrap to get confidence intervals.

> Estim3 <- function(evalfram, bootfram) {
     bootlm <- lm(loss ~ hard+tens, data=bootfram)
     evallmfit <- bootlm$coef[1] + c(data.matrix(evalfram[,2:3]) %*%
                                     bootlm$coef[2:3])
     lmest <- 1 - pnorm((165 - evallmfit)/summary(bootlm)$sigma)
     ## binvec <- as.numeric(bootfram$loss > 165)
     bootlgs <- glm(cbind(binvec, 1-binvec) ~ hard+tens, data=bootfram,
                    family=binomial)
     glmest <- plogis(bootlgs$coef[1] + c(data.matrix(evalfram[,2:3]) %*%
                                          bootlgs$coef[2:3]))
     kerboot <- outer(evallmfit, bootlm$fit, function(x,y) dlogis(x,y,40))
     kerest <- c(kerboot %*% bootfram$binvec)/c(kerboot %*%
                                                rep(1,nrow(bootfram)))
     list(lm.est = lmest, glm.est = glmest, ker.est = kerest) }
> Estim3(RubbAlt[1:5,], RubbAlt)
$lm.est:
[1] 0.99999998 0.85462905 0.50546810 0.19793889 0.04011242
$glm.est:
[1] 0.999999891 0.950345758 0.497125366 0.078551388 0.006529196
$ker.est:
[1] 0.9593606 0.6186340 0.4408652 0.3167944 0.2251193

### This completes the definition of the statistics.  Now we bootstrap, which
### means that the bootfram data will be generated by the different
### with-replacement (nonparametric bootstrap) resampling strategies, while
### evalfram will always be Rubber[1:5,].

> Boot2mat <- Boot1mat <- array(0, dim=c(4000,5,3))
> for (i in 1:4000) Boot1mat[i,,] <- matrix(unlist(Estim3(RubbAlt[1:5,],
       RubbAlt[sample(30,30, replace=T),])), ncol=3)
### This was the simplest bootstrap strategy: pure with-replacement sampling
### of (loss,hard,tens) triples.

### Next consider what is probably the most appealing method for bootstrapping
### a dataset which approximately satisfies a linear regression model with
### additive iid errors.  We begin by setting up the linear regression and
### residuals from the original data, then bootstrapping (hard,tens) with
### replacement and independently bootstrapping residuals with replacement.
> resids <- rubberlm$resid
  yhats <- rubberlm$fit
> for (i in 1:4000) {
     inds1 <- sample(30,30,replace=T)
     inds2 <- sample(30,30,replace=T)
     Boot2mat[i,,] <- matrix(unlist(Estim3(RubbAlt[1:5,],
          cbind.data.frame(loss = yhats[inds1]+resids[inds2],
                           RubbAlt[inds1,2:4]))), ncol=3) }
### In each of these loops there were warnings (of exactly the same type as in
### the cross-validation steps) from the logistic-regression glm() fits.
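### One way to gauge how often a bootstrap replicate produces one of the
### "unstable" logistic fits just mentioned is to count the replicates whose
### glm() call raises a warning.  This is a sketch, not part of the original
### log; "flagged", "bs" and the run of 200 replicates are illustrative choices.

  flagged <- 0
  for (i in 1:200) {
      bs <- RubbAlt[sample(30, 30, replace=TRUE), ]
      warned <- FALSE
      withCallingHandlers(
          glm(cbind(binvec, 1-binvec) ~ hard + tens, data=bs, family=binomial),
          warning = function(w) { warned <<- TRUE
                                  invokeRestart("muffleWarning") })
      if (warned) flagged <- flagged + 1
  }
  flagged         # number of the 200 replicates that triggered a glm() warning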
> M1 <- round( apply(Boot1mat, 2:3, function(x) quantile(x,.025)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 1.0000 0.8583
[2,] 0.6636 0.5107 0.4253
[3,] 0.2621 0.0000 0.2609
[4,] 0.0574 0.0000 0.1600
[5,] 0.0033 0.0000 0.0959
> M2 <- round( apply(Boot1mat, 2:3, function(x) quantile(x,.975)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 1.0000 0.9868
[2,] 0.9680 1.0000 0.7747
[3,] 0.6986 1.0000 0.6166
[4,] 0.3266 0.5820 0.4934
[5,] 0.0898 0.0686 0.3942
> M3 <- round( apply(Boot2mat, 2:3, function(x) quantile(x,.025)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 0.9999 0.8604
[2,] 0.6324 0.5483 0.3842
[3,] 0.2482 0.0000 0.2410
[4,] 0.0445 0.0000 0.1521
[5,] 0.0022 0.0000 0.0921
> M4 <- round( apply(Boot2mat, 2:3, function(x) quantile(x,.975)), 4)
       [,1]   [,2]   [,3]
[1,] 1.0000 1.0000 0.9882
[2,] 0.9836 1.0000 0.8174
[3,] 0.7670 1.0000 0.6655
[4,] 0.3935 0.6294 0.5291
[5,] 0.1183 0.0698 0.4147

### We still have to make confidence intervals out of these quantiles and the
### estimated quantities above (from the original dataset).

### Of course, instead of bootstrapping the residuals we could have taken the
### parametric approach of simulating them as normally distributed with mean 0
### and variance summary(rubberlm)$sigma^2, but that is very similar and less
### appealing!  (A sketch of that variant is given below, after the remarks on
### the functionals.)

OK, for constructing CI's we focus on the function of the data which results
in the estimated probability P(loss > 165 | hard, tens).  Unfortunately, there
is no nonparametric functional which gives a consistent estimator, and this is
not only a problem of the small sample size (30), but also a problem of the
type of (conditional!) probability we are estimating.  The main issue is that,
in a nonparametric context, we have only the observations precisely at the
specific (hard,tens) value to help us estimate this conditional probability.

*** So I sent you all a long message indicating that the functionals T that we
*** are estimating are those indicated above: the one based on normal linear
*** regression, the one based on logistic regression, and the nonparametric
*** kernel regression using the linear combination of the hard and tens
*** variables estimated from linear regression.
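### A sketch of the parametric variant just mentioned: normal errors with SD
### summary(rubberlm)$sigma replace the resampled residuals, and only the loss
### column changes relative to the Boot2 loop.  Boot3mat and sighat are new
### names introduced here, and binvec is carried along from the resampled rows
### exactly as in the Boot2 loop above.

  Boot3mat <- array(0, dim=c(4000,5,3))
  sighat   <- summary(rubberlm)$sigma
  for (i in 1:4000) {
      inds1 <- sample(30, 30, replace=TRUE)
      Boot3mat[i,,] <- matrix(unlist(Estim3(RubbAlt[1:5,],
           cbind.data.frame(loss = yhats[inds1] + rnorm(30, 0, sighat),
                            RubbAlt[inds1, 2:4]))), ncol=3)
  }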
### We complete this analysis by generating an array of confidence intervals,
### as follows: 5 points by 3 estimators by 2 bootstrap methods by
### (lower, upper) CI endpoints.

> Rubber[1:5, 2:3]
  hard tens
1   45  162
2   55  233
3   61  232
4   66  231
5   71  231
> CIarr <- array(0, dim=c(5,3,2,2), dimnames=list(paste("Pt",1:5,sep=""),
       c("lin.reg","lgst.reg","ker.NPR"), c("Lower","Upper"),
       c("Bootstr.a","Bootstr.c")))
> Ests <- matrix(unlist(Estim3(RubbAlt[1:5,], RubbAlt)), ncol=3)
> Ests
           [,1]        [,2]      [,3]
[1,] 0.99999998 0.999999891 0.9593606
[2,] 0.85462905 0.950345758 0.6186340
[3,] 0.50546810 0.497125366 0.4408652
[4,] 0.19793889 0.078551388 0.3167944
[5,] 0.04011242 0.006529196 0.2251193
> CIarr[,,1,1] <- 2*Ests - M2      ### This is the nonparametric
  CIarr[,,2,1] <- 2*Ests - M1      ### bootstrap form of the CI's
  CIarr[,,1,2] <- 2*Ests - M4
  CIarr[,,2,2] <- 2*Ests - M3
> round(pmin(pmax(CIarr,0),1), 3)
, , Lower, Bootstr.a
    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   0.932
Pt2   0.741    0.901   0.463
Pt3   0.312    0.000   0.265
Pt4   0.069    0.000   0.140
Pt5   0.000    0.000   0.056

, , Upper, Bootstr.a
    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   1.000
Pt2   1.000    1.000   0.812
Pt3   0.749    0.994   0.621
Pt4   0.338    0.157   0.474
Pt5   0.077    0.013   0.354

, , Lower, Bootstr.c
    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   0.931
Pt2   0.726    0.901   0.420
Pt3   0.244    0.000   0.216
Pt4   0.002    0.000   0.104
Pt5   0.000    0.000   0.036

, , Upper, Bootstr.c
    lin.reg lgst.reg ker.NPR
Pt1   1.000    1.000   1.000
Pt2   1.000    1.000   0.853
Pt3   0.763    0.994   0.641
Pt4   0.351    0.157   0.481
Pt5   0.078    0.013   0.358
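### For comparison (a sketch, not part of the original log; CIpct is a new
### name): the simpler percentile-method intervals would use the bootstrap
### quantiles M1--M4 directly rather than reflecting them around Ests.

  CIpct <- array(0, dim=c(5,3,2,2), dimnames=dimnames(CIarr))
  CIpct[,,1,1] <- M1;  CIpct[,,2,1] <- M2    # percentile CI's, Bootstrap (a)
  CIpct[,,1,2] <- M3;  CIpct[,,2,2] <- M4    # percentile CI's, Bootstrap (c)
  round(CIpct, 3)
  ## The reflected form 2*Ests - quantile used above corrects the intervals for
  ## bootstrap bias but can fall outside [0,1] (hence the pmin/pmax truncation);
  ## the percentile form stays inside [0,1] automatically.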