LOG TO SUMMARIZE SOME MCMC Examples                              10/5/09ff
===================================

## Begin with a combinatorial example: to simulate
## a random 2-way contingency table with fixed marginals.
## Start by generating a single table and its set of marginals.

Xarr = array(sample(0:3, 35, rep=T), c(5,7))

> Xarr
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]    0    3    1    1    1    2    0
[2,]    1    0    1    3    2    2    0
[3,]    3    3    2    1    1    1    3
[4,]    1    1    1    1    1    3    1
[5,]    2    1    3    0    2    2    0

for(k in 1:2) assign(paste("Marg",k,sep=""), apply(Xarr,k,sum))

> Marg1
[1]  8  9 14  9 10            ### sum = 50
> Marg2
[1]  7  8  8  6  7 10  4

### Now Xarr is one feasible table with these marginals, and we
### create a Markov chain transition to take us from one such
### table to another.
### Idea: look at all row-column pairs (i,j) and (ip,jp)
### with i != ip and j != jp and Xarr positive at both, and
### randomly select one pair, changing to a new table Xnew with the
### (i,j) and (ip,jp) values one smaller and the (i,jp) and (ip,j)
### values one larger than in Xarr.

BigMat = array(1,c(35,35))
for(i in 1:5) for (j in 1:7) for(ip in 1:5) for (jp in 1:7)
    if(i==ip | j==jp) BigMat[i+(j-1)*5, ip+(jp-1)*5] = 0

> sum(BigMat)
[1] 840                       ### compare with 35^2 = 1225

CountPrs = function(Xarr) {
   indv = c(ifelse(Xarr>0,1,0))
   sum(outer(indv,indv)*BigMat) }

> CountPrs(Xarr)
[1] 570

qTrans = function(Xarr) {
   Xnew = Xarr
   indv = c(ifelse(Xarr>0,1,0))
   kpr = sample((1:35^2)[outer(indv,indv)*BigMat > 0], 1) - 1
   k1 = kpr %% 35 ;   k2 = kpr %/% 35
   i  = 1 + (k1%%5);  j  = 1 + (k1%/%5)
   ip = 1 + (k2%%5);  jp = 1 + (k2%/%5)
   Xnew[i,j]   = Xarr[i,j]-1
   Xnew[ip,jp] = Xarr[ip,jp]-1
   Xnew[ip,j]  = Xarr[ip,j]+1
   Xnew[i,jp]  = Xarr[i,jp]+1
   Xnew }

> qTrans(Xarr)-Xarr
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]    0    0    0    0    0    0    0
[2,]    0    0    0    0    0    0    0
[3,]    0    0    0    0    0    0    0
[4,]   -1    1    0    0    0    0    0
[5,]    1   -1    0    0    0    0    0
> qTrans(Xarr)-Xarr
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]    0    0    0    0    0    0    0
[2,]   -1    0    0    0    0    0    1
[3,]    1    0    0    0    0    0   -1
[4,]    0    0    0    0    0    0    0
[5,]    0    0    0    0    0    0    0

### Looks OK ...

### Now to code the MCMC: note that the equilibrium distribution f
### has constant density, although we do not know
### what that constant is ...

MCMCtab = function(Nrep,X0) {
   outarr = array(0, c(Nrep,dim(X0)))
   Xold = X0
   count.old = CountPrs(Xold)
   Uvec = runif(Nrep)
   for(i in 1:Nrep) {
      Xnew = qTrans(Xold)
      count.new = CountPrs(Xnew)
      if( Uvec[i] < count.old/count.new ) {
         count.old = count.new
         Xold = Xnew
         outarr[i,,] = Xold }
   }                          ## Note that we do not record repeats !
   outarr }
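### Aside (not part of the original log): the acceptance ratio
### count.old/count.new is the usual Metropolis-Hastings ratio
### q(Xold|Xnew)/q(Xnew|Xold) for a uniform target, since qTrans picks
### uniformly among the CountPrs(X) admissible pairs of cells of X.
### A minimal single-step sketch making this explicit (mhStep is a new
### illustrative name; it returns the current table even on rejection):

mhStep = function(Xold, count.old = CountPrs(Xold)) {
   Xnew = qTrans(Xold)
   count.new = CountPrs(Xnew)
   if(runif(1) < count.old/count.new)
        list(X = Xnew, count = count.new)    ## accept the proposed table
   else list(X = Xold, count = count.old)    ## reject: keep the old table
}
### e.g.   step1 = mhStep(Xarr);   step1$X - Xarr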
> tmpout = MCMCtab(100,X0)
> table(apply(tmpout,1,sum) > 0)
FALSE  TRUE
    3    97
tmpout[100,,]-Xarr

### Now we want to figure out how many distinct tables
### there are: so use a "hash coding" idea and create
### a vector consisting of the distinct linear
### combinations with a fixed matrix of Unif's.

> UnifTab = array(runif(35), c(5,7))
HashSeq = numeric(1e4 + 1)
HashSeq[1] = sum(Xarr*UnifTab)
indpos = (1:100)[apply(tmpout,1,sum) > 0]
HashSeq[1+indpos] = c(array(tmpout[indpos,,], c(length(indpos),35)) %*% c(UnifTab))

> which(HashSeq[1:101]==0)
[1] 19 36 93
> which(duplicated(HashSeq[1:101]))
[1] 36 54 93
### (36 and 93 just duplicate the 0's left by unrecorded repeats;
###  by chance, items 54 and 52 are identical)

> for(i in 2:100) {
     Xold = tmpout[max(which(HashSeq[(i-2)*100+(2:101)] > 0)),,]
     tmpout = MCMCtab(100,Xold)
     indpos = (1:100)[apply(tmpout,1,sum) > 0]
     HashSeq[1+(i-1)*100+indpos] = c(array(tmpout[indpos,,],
           c(length(indpos),35)) %*% c(UnifTab))  }

> sum(HashSeq[1:10001] == 0)
[1] 393
> sum(duplicated(HashSeq[HashSeq > 0]))
[1] 73                        ### so 10001-393-73 = 9535 are unique

HashSeq = c(HashSeq, rep(0,1e5-1e4))     ## now length 100001
for(i in 101:1000) {
     Xold = tmpout[max(which(HashSeq[(i-2)*100+(2:101)] > 0)),,]
     tmpout = MCMCtab(100,Xold)
     indpos = (1:100)[apply(tmpout,1,sum) > 0]
     HashSeq[1+(i-1)*100+indpos] = c(array(tmpout[indpos,,],
           c(length(indpos),35)) %*% c(UnifTab))  }

> sum(HashSeq > 0)
[1] 96570
> sum(duplicated(HashSeq[HashSeq>0]))
[1] 577             ### so we have visited 96570-577 = 95993 distinct tables !!

HashSeq = c(HashSeq, rep(0,1e5))
unix.time({
  for(i in 1001:2000) {
     Xold = tmpout[max(which(HashSeq[(i-2)*100+(2:101)] > 0)),,]
     tmpout = MCMCtab(100,Xold)
     indpos = (1:100)[apply(tmpout,1,sum) > 0]
     HashSeq[1+(i-1)*100+indpos] = c(array(tmpout[indpos,,],
           c(length(indpos),35)) %*% c(UnifTab))  }  })
   user  system elapsed
 217.75    0.14  228.86       ### almost 4 minutes for 1e5 new tables!

> sum(HashSeq > 0)
[1] 192974
> sum(duplicated(HashSeq[HashSeq>0]))
[1] 1210                      ### Still almost all new tables !!

------------------------------------------------------------
PRELIMINARY TESTING OF CONVERGENCE FOR THE MCMC SIMULATION
JUST DESCRIBED.

> summary(HashSeq)            ### 2e5 simulated in 2 batches of 1e5
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
   0.00   27.27   28.38   27.47   29.48   34.84
> cor(HashSeq[1:1e5],HashSeq[2:(1e5+1)])
[1] 0.07723625
> cor(HashSeq[1:1e5],HashSeq[(1e5+1):2e5])
[1] 0.002001812
### OK: dependence is allowed for
### successive indices, but INDEPENDENCE and
### IDENTICAL DISTRIBUTION should appear over large lags.

> tmppac = pacf(HashSeq[1:1e5])
### shows steady decay from .08 down
### to fluctuating values from .01 down to .002
### over lags 1:50
> tmppac2 = pacf(HashSeq[1e5+(1:1e5)])
### very similar in the second half

### Now create simulated 35-vectors of length 1000
### by sampling every 100th output table.

TabSeq = array(0, c(1000,5,7))
Hash = HashSeq[2e5-100+(2:101)]
Xold = tmpout[max(which(Hash > 0)),,]
for(i in 1:1000) {
   tmpout = MCMCtab(100,Xold)
   indpos = (apply(tmpout,1,sum) > 0)
   Hash = replace(rep(0,100), indpos,
        c(array(tmpout[indpos,,], c(sum(indpos),35)) %*% c(UnifTab)))
   Xold = tmpout[max(which(Hash > 0)),,]
   TabSeq[i,,] = Xold }

> tmpv = apply(TabSeq[,,7],1,max)
> table(tmpv[1:999],tmpv[2:1000])

        1   2   3   4
    1   6  52  21   7
    2  53 347 148  39
    3  20 147  72  19
    4   7  40  18   3

> chisq.test(table(tmpv[1:999],tmpv[2:1000]))

        Pearson's Chi-squared test

data:  table(tmpv[1:999], tmpv[2:1000])
X-squared = 2.5512, df = 9, p-value = 0.9795

### OK !!
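### A related standard summary (not part of the original log): turn the
### serial correlations seen in the pacf/acf displays above into a rough
### effective sample size via the integrated autocorrelation time.
### A minimal sketch (essHash is a new illustrative name), assuming a
### batch of hash values such as HashSeq[1:1e5]:

essHash = function(x, lag.max = 50) {
   rho = acf(x, lag.max = lag.max, plot = FALSE)$acf[-1]   ## drop lag 0
   tau = 1 + 2*sum(rho[rho > 0])   ## crude integrated autocorrelation time
   length(x)/tau                   ## approximate number of "independent" draws
}
### e.g.   essHash(HashSeq[1:1e5])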
### Now do some quick checks to see that all
### of the possible rows and columns of the table
### (one at a time) are sampled fully ...

> length(unique(TabSeq[,,7] %*% runif(5)))
[1] 70        ### this is the right number !!  (column 7 has total 4, and
              ###  there are choose(8,4) = 70 ways to spread 4 over 5 cells)
> table(apply(TabSeq[,,6],1,max))      ## marginal total = 10
   2   3   4   5   6   7   8   9  10
   1 134 328 285 150  69  23   9   1
### OK
> length(unique(TabSeq[,,6] %*% runif(5)))
[1] 543       ### actual max is choose(14,4) = 1001
> rbind(table(TabSeq[1:500,1,6]), table(TabSeq[501:1000,1,6]))
       0   1   2   3   4   5   6   7
[1,] 155 122  96  66  39  15   6   1
[2,] 153 135  83  57  41  25   4   2

### NB. If we want to estimate the true max number
### of possible tables AND we believe that the
### two previous batches of 1e5 were each samples
### of the equilibrium distribution, then we could
### estimate the total roughly, as follows:

Uniques in first 1e5 batch:   95992
Uniques in 2nd   1e5 batch:   95773
Uniques visited in both batches:
> length(intersect(unique(HashSeq[1:1e5]), unique(HashSeq[1e5+(1:1e5)])))
[1] 3
### (A rough capture-recapture estimate of the total would then be
###  95992*95773/3, on the order of 3e9 tables, although with only
###  3 recaptures this figure is extremely imprecise.)

So while for many purposes we may see equilibrium behavior, we do not
sample all possibilities by any means: how could we check convergence ???

arrind = cbind(rep(2:3,c(3,3)), rep(3:5,2))
midsum = function(matr) sum(matr[arrind])
### this is the sum over the following (row,column) positions
### of the table:

> arrind
     [,1] [,2]
[1,]    2    3
[2,]    2    4
[3,]    2    5
[4,]    3    3
[5,]    3    4
[6,]    3    5

> midsum(TabSeq[77,,])
[1] 7
> TabSeq[77,,]
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]    0    0    1    4    1    2    0
[2,]    0    4    1    0    0    4    0
[3,]    4    0    4    2    0    1    3
[4,]    0    1    2    0    4    1    1
[5,]    3    3    0    0    2    2    0

> table(apply(TabSeq[1:500,,],1, midsum))
  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
  2  2  7 17 31 57 70 86 75 59 40 32 12  5  4  1
> table(apply(TabSeq[501:1000,,],1, midsum))
  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
  2  6 14 29 51 79 67 82 66 48 25 18  8  3  2

### Looks OK !!!  Let's group into cells:  <6, 6, 7, ..., 14, >14

> vec1 = hist(apply(TabSeq[1:500,,],1, midsum),
              breaks=c(0,5.5:14.5,18), plot=F)$count
vec2 = hist(apply(TabSeq[501:1000,,],1, midsum),
              breaks=c(0,5.5:14.5,18), plot=F)$count
> sum((vec1-vec2)^2/(vec1+vec2))
[1] 7.90543                   ### fine for a 2-sample chisq, df=10

### Try one more exhibit: the table's row-column chisq statistic.

> statSeq = apply(TabSeq,1, function(matr) chisq.test(matr)$stat)
### looks very noisy and erratic when plotted in time sequence, BUT
> pacf(statSeq)
### looks significant at lags 4 and 6 !!

TabSeq2 = array(0, c(1000,5,7))
for(i in 1:1000) {
   tmpout = MCMCtab(100,Xold)
   indpos = (apply(tmpout,1,sum) > 0)
   Hash = replace(rep(0,100), indpos,
        c(array(tmpout[indpos,,], c(sum(indpos),35)) %*% c(UnifTab)))
   Xold = tmpout[max(which(Hash > 0)),,]
   TabSeq2[i,,] = Xold }

> statSeq2 = apply(TabSeq2,1, function(matr) chisq.test(matr)$stat)
pacf(statSeq2)
### now the only pacf value that looks a little bit significant
### is at lag 17, so it seems very likely that
### none of these apparent effects is real !

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

Try a little further testing, based on coding a metric for the
concentration of the table entries:

> Xarr
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]    0    3    1    1    1    2    0
[2,]    1    0    1    3    2    2    0
[3,]    3    3    2    1    1    1    3
[4,]    1    1    1    1    1    3    1
[5,]    2    1    3    0    2    2    0

> tmpout = MCMCtab(1000,Xarr)
> dim(tmpout)
[1] 1000    5    7
> tmpout = MCMCtab(1000,tmpout[1000,,])
tvec = apply(tmpout,1, function(tab) var(c(tab)))
> acf(tvec, lag.max=50)       ### looks pretty good
> spectrum(tvec, method="ar")
> tmpout = MCMCtab(1000,tmpout[1000,,])
  tvec = apply(tmpout,1, function(tab) var(c(tab)))
> spectrum(tvec, method="ar")
> tmpout = MCMCtab(1000,tmpout[1000,,])
  tvec = apply(tmpout,1, function(tab) var(c(tab)))
> spectrum(tvec, method="ar")
#### Looks really similar each time !
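### One more simple stationarity check along the same lines (not part of
### the original log): compare batch means of the concentration statistic
### across non-overlapping blocks; roughly constant batch means, relative
### to their spread, are consistent with equilibrium behavior.
### A minimal sketch (batchMeans is a new illustrative name), assuming
### tvec as computed just above:

batchMeans = function(x, nbatch = 20) {
   b = floor(length(x)/nbatch)              ## observations per batch
   colMeans(matrix(x[1:(b*nbatch)], nrow=b))
}
### e.g.   round(batchMeans(tvec), 3)       ## 20 batch means, 50 values each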
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 HW Problem based on MCMC sampling of a uniform point in R^7
 simultaneously satisfying 15 specified feasible linear inequalities
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

CONSIDER SOME MUCH SIMPLER EXAMPLES FOLLOWING KEDEM (1999)

(a) MCMC for Generating N(0,1) Data From a DoubExp Random Walk

## We want to simulate X_t ~ N(0,1), i.e., f(x) = dnorm(x),
## with proposal kernel q giving iid double-exponential
## transition-steps, q(y|x) = .5*exp(-abs(y-x)),   NB: symmetric !

unix.time({
  DEseq = sample(c(-1,1), 1e5, rep=T)*rexp(1e5)
  Useq = runif(1e5)
  Yseq = numeric(1e5+1)                ### initial Y0 = 0
  for(i in 1:1e5) {
     new = Yseq[i]+DEseq[i]
     ## accept w.p. min(1, f(new)/f(old)); the symmetric q's cancel
     Yseq[i+1] = if(Useq[i] < dnorm(new)/dnorm(Yseq[i])) new else Yseq[i]
  }  })

> hist(Yseq[1+(1:1e5)], nclass=50, prob=T)
  curve(dnorm(x), add=T, lty=3)        ### perfect !
> sum(duplicated(Yseq))
[1] 33492
### This is the number of times, roughly 1 in 3,
### where no new deviate is generated.
> acf(Yseq[!duplicated(Yseq)])
### dependence dies away completely by lag 10

(b) Now re-do the MCMC with a proposal kernel giving iid DoubExp draws
### that is, use the same f but the nonsymmetric q(y|x) = .5*exp(-abs(y))

unix.time({
  DEseq = sample(c(-1,1), 1e5, rep=T)*rexp(1e5)
  Useq = runif(1e5)
  Yseq = numeric(1e5+1)                ### initial Y0 = 0
  for(i in 1:1e5) {
     new = DEseq[i]
     ## accept w.p. min(1, f(new)*q(old)/(f(old)*q(new))); the .5's cancel
     Yseq[i+1] = if(Useq[i] < dnorm(new)*exp(-abs(Yseq[i])) /
                             (dnorm(Yseq[i])*exp(-abs(new)))) new else Yseq[i]
  }  })

> hist(Yseq[1+(1:1e5)], nclass=50, prob=T)
  curve(dnorm(x), add=T, lty=3)        ### again perfect !
> qqnorm(Yseq[2:(1e5+1)])

### But in the second method, distinct results are iid
### (lags >= 1 show no dependence in acf(Yseq[!duplicated(Yseq)]));
### thus the second method exactly coincides with accept-reject.
> sum(duplicated(Yseq))
[1] 16422                    ### far fewer "wasted" repeats
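### For reference, a generic Metropolis-Hastings wrapper covering both
### (a) and (b) above; MHchain, fTarg, rProp and dProp are illustrative
### names introduced here (not from Kedem 1999), and this is only a
### sketch of the scheme used in those two examples:

MHchain = function(N, y0, fTarg, rProp, dProp) {
   Y = numeric(N+1);  Y[1] = y0
   for(i in 1:N) {
      cand  = rProp(Y[i])                      ## propose a candidate
      ratio = fTarg(cand)*dProp(Y[i], cand) /
              (fTarg(Y[i])*dProp(cand, Y[i]))  ## MH acceptance ratio
      Y[i+1] = if(runif(1) < ratio) cand else Y[i]
   }
   Y }

### (a) random-walk DoubExp proposal:
###   Ya = MHchain(1e4, 0, dnorm, function(x) x + sample(c(-1,1),1)*rexp(1),
###                function(y,x) 0.5*exp(-abs(y-x)))
### (b) independence DoubExp proposal:
###   Yb = MHchain(1e4, 0, dnorm, function(x) sample(c(-1,1),1)*rexp(1),
###                function(y,x) 0.5*exp(-abs(y)))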