LOG TO SUMMARIZE SOME MCMC Examples                              10/5/09ff
===================================

## Begin with a combinatorial example: to simulate
## a random 2-way contingency table with fixed marginals.
## Start by generating a single table and its set of marginals.

Xarr = array(sample(0:3, 35, rep=T), c(5,7))

> Xarr
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]    0    3    1    1    1    2    0
[2,]    1    0    1    3    2    2    0
[3,]    3    3    2    1    1    1    3
[4,]    1    1    1    1    1    3    1
[5,]    2    1    3    0    2    2    0

for(k in 1:2) assign(paste("Marg",k,sep=""), apply(Xarr,k,sum))

> Marg1
[1]  8  9 14  9 10            ### sum = 50
> Marg2
[1]  7  8  8  6  7 10  4

### Now Xarr is one feasible table with these marginals, and we
### create a Markov chain transition to take us from one such
### table to another.
### Idea: look at all row-column pairs (i,j) and (ip,jp)
### with i != ip and j != jp and Xarr positive at both, and
### randomly select one pair, changing to a new table Xnew with the
### (i,j) and (ip,jp) values one smaller and the (i,jp) and (ip,j)
### values one larger than in Xarr.

BigMat = array(1,c(35,35))
for(i in 1:5) for (j in 1:7) for(ip in 1:5) for (jp in 1:7)
    if(i==ip | j==jp) BigMat[i+(j-1)*5, ip+(jp-1)*5] = 0

> sum(BigMat)
[1] 840                       ### compare with 35^2 = 1225

CountPrs = function(Xarr) {
   indv = c(ifelse(Xarr>0,1,0))
   sum(outer(indv,indv)*BigMat) }

> CountPrs(Xarr)
[1] 570

qTrans = function(Xarr) {
   Xnew = Xarr
   indv = c(ifelse(Xarr>0,1,0))
   kpr = sample((1:35^2)[outer(indv,indv)*BigMat > 0], 1) - 1
   k1 = kpr %% 35 ;   k2 = kpr %/% 35
   i  = 1 + (k1%%5);  j  = 1 + (k1%/%5)
   ip = 1 + (k2%%5);  jp = 1 + (k2%/%5)
   Xnew[i,j]   = Xarr[i,j]-1
   Xnew[ip,jp] = Xarr[ip,jp]-1
   Xnew[ip,j]  = Xarr[ip,j]+1
   Xnew[i,jp]  = Xarr[i,jp]+1
   Xnew }

> qTrans(Xarr)-Xarr
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]    0    0    0    0    0    0    0
[2,]    0    0    0    0    0    0    0
[3,]    0    0    0    0    0    0    0
[4,]   -1    1    0    0    0    0    0
[5,]    1   -1    0    0    0    0    0
> qTrans(Xarr)-Xarr
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]    0    0    0    0    0    0    0
[2,]   -1    0    0    0    0    0    1
[3,]    1    0    0    0    0    0   -1
[4,]    0    0    0    0    0    0    0
[5,]    0    0    0    0    0    0    0

### Looks OK ...

### Now to code the MCMC: note that the equilibrium distribution f
### has constant density, although we do not know
### what that constant is ...

MCMCtab = function(Nrep,X0) {
   outarr = array(0, c(Nrep,dim(X0)))
   Xold = X0
   count.old = CountPrs(Xold)
   Uvec = runif(Nrep)
   for(i in 1:Nrep) {
      Xnew = qTrans(Xold)
      count.new = CountPrs(Xnew)
      if( Uvec[i] < count.old/count.new ) {
         count.old = count.new
         Xold = Xnew
         outarr[i,,] = Xold }
   }                          ## Note that we do not record repeats !
   outarr }
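### Aside (not part of the original log): the acceptance ratio
### count.old/count.new is the usual Metropolis-Hastings ratio
### q(Xold|Xnew)/q(Xnew|Xold) for a uniform target, since qTrans picks
### uniformly among the CountPrs(X) admissible pairs of cells of X.
### A minimal single-step sketch making this explicit (mhStep is a new
### illustrative name; it returns the current table even on rejection):

mhStep = function(Xold, count.old = CountPrs(Xold)) {
   Xnew = qTrans(Xold)
   count.new = CountPrs(Xnew)
   if(runif(1) < count.old/count.new)
        list(X = Xnew, count = count.new)    ## accept the proposed table
   else list(X = Xold, count = count.old)    ## reject: keep the old table
}
### e.g.   step1 = mhStep(Xarr);   step1$X - Xarr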
> tmpout = MCMCtab(100,X0)
> table(apply(tmpout,1,sum) > 0)
FALSE  TRUE
    3    97
tmpout[100,,]-Xarr

### Now we want to figure out how many distinct tables
### there are: so use a "hash coding" idea and create
### a vector consisting of the distinct linear
### combinations with a fixed matrix of Unif's.

> UnifTab = array(runif(35), c(5,7))
HashSeq = numeric(1e4 + 1)
HashSeq[1] = sum(Xarr*UnifTab)
indpos = (1:100)[apply(tmpout,1,sum) > 0]
HashSeq[1+indpos] = c(array(tmpout[indpos,,], c(length(indpos),35)) %*% c(UnifTab))

> which(HashSeq[1:101]==0)
[1] 19 36 93
> which(duplicated(HashSeq[1:101]))
[1] 36 54 93
### (36 and 93 just duplicate the 0's left by unrecorded repeats;
###  by chance, items 54 and 52 are identical)

> for(i in 2:100) {
     Xold = tmpout[max(which(HashSeq[(i-2)*100+(2:101)] > 0)),,]
     tmpout = MCMCtab(100,Xold)
     indpos = (1:100)[apply(tmpout,1,sum) > 0]
     HashSeq[1+(i-1)*100+indpos] = c(array(tmpout[indpos,,],
           c(length(indpos),35)) %*% c(UnifTab))  }

> sum(HashSeq[1:10001] == 0)
[1] 393
> sum(duplicated(HashSeq[HashSeq > 0]))
[1] 73                        ### so 10001-393-73 = 9535 are unique

HashSeq = c(HashSeq, rep(0,1e5-1e4))     ## now length 100001
for(i in 101:1000) {
     Xold = tmpout[max(which(HashSeq[(i-2)*100+(2:101)] > 0)),,]
     tmpout = MCMCtab(100,Xold)
     indpos = (1:100)[apply(tmpout,1,sum) > 0]
     HashSeq[1+(i-1)*100+indpos] = c(array(tmpout[indpos,,],
           c(length(indpos),35)) %*% c(UnifTab))  }

> sum(HashSeq > 0)
[1] 96570
> sum(duplicated(HashSeq[HashSeq>0]))
[1] 577             ### so we have visited 96570-577 = 95993 distinct tables !!

HashSeq = c(HashSeq, rep(0,1e5))
unix.time({
  for(i in 1001:2000) {
     Xold = tmpout[max(which(HashSeq[(i-2)*100+(2:101)] > 0)),,]
     tmpout = MCMCtab(100,Xold)
     indpos = (1:100)[apply(tmpout,1,sum) > 0]
     HashSeq[1+(i-1)*100+indpos] = c(array(tmpout[indpos,,],
           c(length(indpos),35)) %*% c(UnifTab))  }  })
   user  system elapsed
 217.75    0.14  228.86       ### almost 4 minutes for 1e5 new tables!

> sum(HashSeq > 0)
[1] 192974
> sum(duplicated(HashSeq[HashSeq>0]))
[1] 1210                      ### Still almost all new tables !!

------------------------------------------------------------
PRELIMINARY TESTING OF CONVERGENCE FOR THE MCMC SIMULATION
JUST DESCRIBED.

> summary(HashSeq)            ### 2e5 simulated in 2 batches of 1e5
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
   0.00   27.27   28.38   27.47   29.48   34.84
> cor(HashSeq[1:1e5],HashSeq[2:(1e5+1)])
[1] 0.07723625
> cor(HashSeq[1:1e5],HashSeq[(1e5+1):2e5])
[1] 0.002001812
### OK: dependence is allowed for
### successive indices, but INDEPENDENCE and
### IDENTICAL DISTRIBUTION should appear over large lags.

> tmppac = pacf(HashSeq[1:1e5])
### shows steady decay from .08 down
### to fluctuating values from .01 down to .002
### over lags 1:50
> tmppac2 = pacf(HashSeq[1e5+(1:1e5)])
### very similar in the second half

### Now create simulated 35-vectors of length 1000
### by sampling every 100th output table.

TabSeq = array(0, c(1000,5,7))
Hash = HashSeq[2e5-100+(2:101)]
Xold = tmpout[max(which(Hash > 0)),,]
for(i in 1:1000) {
   tmpout = MCMCtab(100,Xold)
   indpos = (apply(tmpout,1,sum) > 0)
   Hash = replace(rep(0,100), indpos,
        c(array(tmpout[indpos,,], c(sum(indpos),35)) %*% c(UnifTab)))
   Xold = tmpout[max(which(Hash > 0)),,]
   TabSeq[i,,] = Xold }

> tmpv = apply(TabSeq[,,7],1,max)
> table(tmpv[1:999],tmpv[2:1000])

        1   2   3   4
    1   6  52  21   7
    2  53 347 148  39
    3  20 147  72  19
    4   7  40  18   3

> chisq.test(table(tmpv[1:999],tmpv[2:1000]))

        Pearson's Chi-squared test

data:  table(tmpv[1:999], tmpv[2:1000])
X-squared = 2.5512, df = 9, p-value = 0.9795

### OK !!
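### A related standard summary (not part of the original log): turn the
### serial correlations seen in the pacf/acf displays above into a rough
### effective sample size via the integrated autocorrelation time.
### A minimal sketch (essHash is a new illustrative name), assuming a
### batch of hash values such as HashSeq[1:1e5]:

essHash = function(x, lag.max = 50) {
   rho = acf(x, lag.max = lag.max, plot = FALSE)$acf[-1]   ## drop lag 0
   tau = 1 + 2*sum(rho[rho > 0])   ## crude integrated autocorrelation time
   length(x)/tau                   ## approximate number of "independent" draws
}
### e.g.   essHash(HashSeq[1:1e5])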
### Now do some quick checks to see that all
### of the possible rows and columns of the table
### (one at a time) are sampled fully ...

> length(unique(TabSeq[,,7] %*% runif(5)))
[1] 70        ### this is the right number !!  (column 7 has total 4, and
              ###  there are choose(8,4) = 70 ways to spread 4 over 5 cells)
> table(apply(TabSeq[,,6],1,max))      ## marginal total = 10
   2   3   4   5   6   7   8   9  10
   1 134 328 285 150  69  23   9   1
### OK
> length(unique(TabSeq[,,6] %*% runif(5)))
[1] 543       ### actual max is choose(14,4) = 1001
> rbind(table(TabSeq[1:500,1,6]), table(TabSeq[501:1000,1,6]))
       0   1   2   3   4   5   6   7
[1,] 155 122  96  66  39  15   6   1
[2,] 153 135  83  57  41  25   4   2

### NB. If we want to estimate the true max number
### of possible tables AND we believe that the
### two previous batches of 1e5 were each samples
### of the equilibrium distribution, then we could
### estimate the total roughly, as follows:

Uniques in first 1e5 batch:   95992
Uniques in 2nd   1e5 batch:   95773
Uniques visited in both batches:
> length(intersect(unique(HashSeq[1:1e5]), unique(HashSeq[1e5+(1:1e5)])))
[1] 3
### (A rough capture-recapture estimate of the total would then be
###  95992*95773/3, on the order of 3e9 tables, although with only
###  3 recaptures this figure is extremely imprecise.)

So while for many purposes we may see equilibrium behavior, we do not
sample all possibilities by any means: how could we check convergence ???

arrind = cbind(rep(2:3,c(3,3)), rep(3:5,2))
midsum = function(matr) sum(matr[arrind])
### this is the sum over the following (row,column) positions
### of the table:

> arrind
     [,1] [,2]
[1,]    2    3
[2,]    2    4
[3,]    2    5
[4,]    3    3
[5,]    3    4
[6,]    3    5

> midsum(TabSeq[77,,])
[1] 7
> TabSeq[77,,]
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]    0    0    1    4    1    2    0
[2,]    0    4    1    0    0    4    0
[3,]    4    0    4    2    0    1    3
[4,]    0    1    2    0    4    1    1
[5,]    3    3    0    0    2    2    0

> table(apply(TabSeq[1:500,,],1, midsum))
  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
  2  2  7 17 31 57 70 86 75 59 40 32 12  5  4  1
> table(apply(TabSeq[501:1000,,],1, midsum))
  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
  2  6 14 29 51 79 67 82 66 48 25 18  8  3  2

### Looks OK !!!  Let's group into cells:  <6, 6, 7, ..., 14, >14

> vec1 = hist(apply(TabSeq[1:500,,],1, midsum),
              breaks=c(0,5.5:14.5,18), plot=F)$count
vec2 = hist(apply(TabSeq[501:1000,,],1, midsum),
              breaks=c(0,5.5:14.5,18), plot=F)$count
> sum((vec1-vec2)^2/(vec1+vec2))
[1] 7.90543                   ### fine for a 2-sample chisq, df=10

### Try one more exhibit: the table's row-column chisq statistic.

> statSeq = apply(TabSeq,1, function(matr) chisq.test(matr)$stat)
### looks very noisy and erratic when plotted in time sequence, BUT
> pacf(statSeq)
### looks significant at lags 4 and 6 !!

TabSeq2 = array(0, c(1000,5,7))
for(i in 1:1000) {
   tmpout = MCMCtab(100,Xold)
   indpos = (apply(tmpout,1,sum) > 0)
   Hash = replace(rep(0,100), indpos,
        c(array(tmpout[indpos,,], c(sum(indpos),35)) %*% c(UnifTab)))
   Xold = tmpout[max(which(Hash > 0)),,]
   TabSeq2[i,,] = Xold }

> statSeq2 = apply(TabSeq2,1, function(matr) chisq.test(matr)$stat)
pacf(statSeq2)
### now the only pacf value that looks a little bit significant
### is at lag 17, so it seems very likely that
### none of these apparent effects is real !

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

Try a little further testing, based on coding a metric for the
concentration of the table entries:

> Xarr
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]    0    3    1    1    1    2    0
[2,]    1    0    1    3    2    2    0
[3,]    3    3    2    1    1    1    3
[4,]    1    1    1    1    1    3    1
[5,]    2    1    3    0    2    2    0

> tmpout = MCMCtab(1000,Xarr)
> dim(tmpout)
[1] 1000    5    7
> tmpout = MCMCtab(1000,tmpout[1000,,])
tvec = apply(tmpout,1, function(tab) var(c(tab)))
> acf(tvec, lag.max=50)       ### looks pretty good
> spectrum(tvec, method="ar")
> tmpout = MCMCtab(1000,tmpout[1000,,])
  tvec = apply(tmpout,1, function(tab) var(c(tab)))
> spectrum(tvec, method="ar")
> tmpout = MCMCtab(1000,tmpout[1000,,])
  tvec = apply(tmpout,1, function(tab) var(c(tab)))
> spectrum(tvec, method="ar")
#### Looks really similar each time !
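### One more simple stationarity check along the same lines (not part of
### the original log): compare batch means of the concentration statistic
### across non-overlapping blocks; roughly constant batch means, relative
### to their spread, are consistent with equilibrium behavior.
### A minimal sketch (batchMeans is a new illustrative name), assuming
### tvec as computed just above:

batchMeans = function(x, nbatch = 20) {
   b = floor(length(x)/nbatch)              ## observations per batch
   colMeans(matrix(x[1:(b*nbatch)], nrow=b))
}
### e.g.   round(batchMeans(tvec), 3)       ## 20 batch means, 50 values each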
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 HW Problem based on MCMC sampling of a uniform point in R^7
 simultaneously satisfying 15 specified feasible linear inequalities
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

CONSIDER SOME MUCH SIMPLER EXAMPLES FOLLOWING KEDEM (1999)

(a) MCMC for Generating N(0,1) Data From a DoubExp Random Walk

## We want to simulate X_t ~ N(0,1), i.e., f(x) = dnorm(x),
## with proposal kernel q giving iid double-exponential
## transition-steps, q(y|x) = .5*exp(-abs(y-x)),   NB: symmetric !

unix.time({
  DEseq = sample(c(-1,1), 1e5, rep=T)*rexp(1e5)
  Useq = runif(1e5)
  Yseq = numeric(1e5+1)                ### initial Y0 = 0
  for(i in 1:1e5) {
     new = Yseq[i]+DEseq[i]
     ## accept w.p. min(1, f(new)/f(old)); the symmetric q's cancel
     Yseq[i+1] = if(Useq[i] < dnorm(new)/dnorm(Yseq[i])) new else Yseq[i]
  }  })

> hist(Yseq[1+(1:1e5)], nclass=50, prob=T)
  curve(dnorm(x), add=T, lty=3)        ### perfect !
> sum(duplicated(Yseq))
[1] 33492
### This is the number of times, roughly 1 in 3,
### where no new deviate is generated.
> acf(Yseq[!duplicated(Yseq)])
### dependence dies away completely by lag 10

(b) Now re-do the MCMC with a proposal kernel giving iid DoubExp draws
### that is, use the same f but the nonsymmetric q(y|x) = .5*exp(-abs(y))

unix.time({
  DEseq = sample(c(-1,1), 1e5, rep=T)*rexp(1e5)
  Useq = runif(1e5)
  Yseq = numeric(1e5+1)                ### initial Y0 = 0
  for(i in 1:1e5) {
     new = DEseq[i]
     ## accept w.p. min(1, f(new)*q(old)/(f(old)*q(new))); the .5's cancel
     Yseq[i+1] = if(Useq[i] < dnorm(new)*exp(-abs(Yseq[i])) /
                             (dnorm(Yseq[i])*exp(-abs(new)))) new else Yseq[i]
  }  })

> hist(Yseq[1+(1:1e5)], nclass=50, prob=T)
  curve(dnorm(x), add=T, lty=3)        ### again perfect !
> qqnorm(Yseq[2:(1e5+1)])

### But in the second method, distinct results are iid
### (lags >= 1 show no dependence in acf(Yseq[!duplicated(Yseq)]));
### thus the second method exactly coincides with accept-reject.
> sum(duplicated(Yseq))
[1] 16422                    ### far fewer "wasted" repeats
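### For reference, a generic Metropolis-Hastings wrapper covering both
### (a) and (b) above; MHchain, fTarg, rProp and dProp are illustrative
### names introduced here (not from Kedem 1999), and this is only a
### sketch of the scheme used in those two examples:

MHchain = function(N, y0, fTarg, rProp, dProp) {
   Y = numeric(N+1);  Y[1] = y0
   for(i in 1:N) {
      cand  = rProp(Y[i])                      ## propose a candidate
      ratio = fTarg(cand)*dProp(Y[i], cand) /
              (fTarg(Y[i])*dProp(cand, Y[i]))  ## MH acceptance ratio
      Y[i+1] = if(runif(1) < ratio) cand else Y[i]
   }
   Y }

### (a) random-walk DoubExp proposal:
###   Ya = MHchain(1e4, 0, dnorm, function(x) x + sample(c(-1,1),1)*rexp(1),
###                function(y,x) 0.5*exp(-abs(y-x)))
### (b) independence DoubExp proposal:
###   Yb = MHchain(1e4, 0, dnorm, function(x) sample(c(-1,1),1)*rexp(1),
###                function(y,x) 0.5*exp(-abs(y)))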