HW3 Solutions Log     3/12/03

(A) The idea is to simulate (x,y) pairs uniformly in the unit square and reject
whenever ( x < 1/3 & y > 3*x ) or ( x >= 1/3 & y > 1.5*(1-x) ). The probability
of rejection is exactly 1/2 each time, so to generate 1000 accepted pairs you
will generate about E(Geom(0.5))*1000 = 2000 pairs of Unif[0,1] variates
(U_1,U_2). The probability that it will take more than 2200 tries to get 1000
successes is pbinom(999, 2200, .5) = 9e-6, so we generate 2200 pairs outright.

> Umat <- matrix(runif(4400), ncol=2)
> Tvec <- (1:2200)[(Umat[,1] < 1/3 & Umat[,2] <= 3*Umat[,1]) |
                   (Umat[,1] >= 1/3 & Umat[,2] <= 1.5*(1-Umat[,1]))]
> XYmat <- Umat[Tvec[1:1000],]
> motif()
> plot(XYmat[,1], XYmat[,2])
> unix.time( { Umat <- matrix(runif(4400), ncol=2)
     Tvec <- (1:2200)[(Umat[,1] < 1/3 & Umat[,2] <= 3*Umat[,1]) |
                      (Umat[,1] >= 1/3 & Umat[,2] <= 1.5*(1-Umat[,1]))]
     XYmat <- Umat[Tvec[1:1000],] })
   ### .02 secs

(B)
> Umat <- matrix(runif(2000), ncol=2)
  ### Use the probability integral transform and the conditional transform.
> inds <- Umat[,1] <= 1/3
  XYmat <- array(0, dim=c(1000,2))
  XYmat[inds,1] <- sqrt(Umat[inds,1]/3)
  XYmat[inds,2] <- 3*XYmat[inds,1]*Umat[inds,2]
  XYmat[!inds,1] <- 1-sqrt((2/3)*(1-Umat[!inds,1]))
  XYmat[!inds,2] <- 1.5*(1-XYmat[!inds,1])*Umat[!inds,2]
> plot(XYmat[,1], XYmat[,2])
> unix.time( { Umat <- matrix(runif(2000), ncol=2)
     inds <- Umat[,1] <= 1/3
     XYmat <- array(0, dim=c(1000,2))
     XYmat[inds,1] <- sqrt(Umat[inds,1]/3)
     XYmat[inds,2] <- 3*XYmat[inds,1]*Umat[inds,2]
     XYmat[!inds,1] <- 1-sqrt((2/3)*(1-Umat[!inds,1]))
     XYmat[!inds,2] <- 1.5*(1-XYmat[!inds,1])*Umat[!inds,2] })
   ### .02 secs, essentially the same !!!

Third method, using the special geometry and the fact that P(X < 1/3) = 1/3:

> unix.time({ XYmat <- matrix(runif(2000), ncol=2)
     indsA <- (XYmat[,1] < 1/3) & (XYmat[,2] > 3*XYmat[,1])
     indsB <- (XYmat[,1] >= 1/3) & (XYmat[,2] > 1.5*(1-XYmat[,1]))
     XYmat[indsA,] <- cbind(rep(1/3,sum(indsA)), rep(1,sum(indsA))) - XYmat[indsA,]
     XYmat[indsB,] <- cbind(rep(4/3,sum(indsB)), rep(1,sum(indsB))) - XYmat[indsB,] })
   ### .02 secs, essentially the same !!!

### Too hard to choose between these !! Re-do them with 10000 iterations.

> unix.time( { Umat <- matrix(runif(41500), ncol=2)
     Tvec <- (1:20750)[(Umat[,1] < 1/3 & Umat[,2] <= 3*Umat[,1]) |
                       (Umat[,1] >= 1/3 & Umat[,2] <= 1.5*(1-Umat[,1]))]
     XYmat <- Umat[Tvec[1:10000],] })
[1] 0.150 0.030 1.00 0 0         ### Method 1.
> unix.time( { Umat <- matrix(runif(20000), ncol=2)
     inds <- Umat[,1] <= 1/3
     XYmat <- array(0, dim=c(10000,2))
     XYmat[inds,1] <- sqrt(Umat[inds,1]/3)
     XYmat[inds,2] <- 3*XYmat[inds,1]*Umat[inds,2]
     XYmat[!inds,1] <- 1-sqrt((2/3)*(1-Umat[!inds,1]))
     XYmat[!inds,2] <- 1.5*(1-XYmat[!inds,1])*Umat[!inds,2] })
[1] 0.140 0.01 1.00 0 0          ### Method 2.
> unix.time({ XYmat <- matrix(runif(20000), ncol=2)
     indsA <- (XYmat[,1] < 1/3) & (XYmat[,2] > 3*XYmat[,1])
     indsB <- (XYmat[,1] >= 1/3) & (XYmat[,2] > 1.5*(1-XYmat[,1]))
     XYmat[indsA,] <- cbind(rep(1/3,sum(indsA)), rep(1,sum(indsA))) - XYmat[indsA,]
     XYmat[indsB,] <- cbind(rep(4/3,sum(indsB)), rep(1,sum(indsB))) - XYmat[indsB,] })
[1] 0.13 0.00 1.00 0.00 0.00     ### Method 3.
## So there is a slight preference for method 2 or 3.
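As a quick check that all three methods target the same uniform distribution on
the triangle, one can compare the empirical CDF of the simulated X-coordinates
with the marginal CDF F(x) = 3*x^2 for x < 1/3 and F(x) = 1 - 1.5*(1-x)^2 for
x >= 1/3, which is exactly the CDF inverted in Method 2. A minimal sketch of
such a check (the helper name checkX is my own addition, not part of the
assignment):

checkX <- function(xvals) {
     ## Kolmogorov-Smirnov-type discrepancy between the empirical CDF of the
     ## simulated X-coordinates and the theoretical marginal CDF of X
     xs <- sort(xvals)
     Ftheo <- ifelse(xs < 1/3, 3*xs^2, 1 - 1.5*(1-xs)^2)
     Femp <- (1:length(xs))/length(xs)
     max(abs(Femp - Ftheo))
}
checkX(XYmat[,1])    ## should be small, on the order of 1/sqrt(nrow(XYmat))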
(C) Now consider 2x2 tables and the Xsq statistic:

> unix.time({ n1plus <- rbinom(1000,100,.6)
     n11 <- rbinom(1000,n1plus,.7)
     n21 <- rbinom(1000,100-n1plus,.7)
     ### This is much simpler than working directly with "sample" !
     nplus1 <- n11+n21
     nexp <- cbind((n1plus/100)*cbind(nplus1,100-nplus1),
                   (1-n1plus/100)*cbind(nplus1,100-nplus1))
     nobs <- cbind(n11,n1plus-n11,n21,100-n1plus-n21)
     Xsq <- c(((nobs-nexp)^2/nexp) %*% rep(1,4)) })
> c(summary(Xsq), rejfrac=mean(Xsq>3.84))
 Min. 1st Qu. Median  Mean 3rd Qu.  Max. rejfrac
    0   0.111 0.5432 1.052   1.412 13.24   0.049
## On another timing run, which took 0.06 secs:
 Min. 1st Qu.  Median  Mean 3rd Qu.  Max. rejfrac
    0 0.08823  0.4884 1.042   1.338 14.65   0.057
## In these cases, the CI half-width is 1.96*sqrt(.05*.95/1000) = 0.0135.

(D) Now try importance sampling.

This problem, which we discussed twice in class [at which point I had still not
solved it correctly!], is a real object lesson in the difficulties of importance
sampling. The point is that too restrictive a set of working importance-sampling
densities from which to choose may leave no good options !!

In this problem, we began by trying to do importance sampling from a working
density (or rather, probability mass function) which was itself multinomial.
Let's first go through those steps and understand how and why the
importance-sampling estimators fail to work.

> Import
function(pvec, Nrep = 1000)
{
     p1plus <- pvec[1] + pvec[3]
     p1f <- pvec[1]/p1plus
     p2f <- pvec[2]/(1 - p1plus)
     n1plus <- rbinom(Nrep, 100, p1plus)
     n11 <- rbinom(Nrep, n1plus, p1f)
     n21 <- rbinom(Nrep, 100 - n1plus, p2f)
     ### This is much simpler than working directly with "sample" !
     nplus1 <- n11 + n21
     nexp <- cbind((nplus1/100) * cbind(n1plus, 100 - n1plus),
                   (1 - nplus1/100) * cbind(n1plus, 100 - n1plus))
     nobs <- cbind(n11, n21, n1plus - n11, 100 - n1plus - n21)
     Xsq <- ((nobs - nexp)^2/nexp) %*% rep(1, 4)
     Ratio <- exp(nobs %*% log(c(0.42, 0.28, 0.18, 0.12)/pvec))
     list(WtRej = (Xsq > 3.84) * Ratio, Xsq = Xsq, Ratio = Ratio,
          p1plus = p1plus, p1f = p1f, p2f = p2f, nobs = nobs,
          pvec = pvec, n1plus = n1plus)
}

> tmpo <- Import(c(.42,.28,.18,.12))
> table(tmpo$WtRej)
   0   1
 957  43      ### typical null-hypothesis run: .043 = estimated size

### To get the right answer:
> tmpo <- Import(c(.42,.28,.18,.12), Nrep=1e5)
> c(mean(tmpo$WtRej) -> rp, 1.96*sqrt(rp*(1-rp)/1e5))
[1] 0.050740000 0.001360266      ## Interval is (.04938, .0521)

### Now proceed with first row-sum p1plus = .6, first column-sum = .7, and
### pvec[1] based on the hypergeometric variance and a normal approximation,
### equal to (42 + 1.96*sqrt(60*70*30*40/(99*100^2)))/100 = 0.4642.
### It is easy to verify that the Xsq chi-square statistic for
### independence of row and column classifications is 3.84
### when the expected table values (with table total 100)
### are substituted as data.

> tmpo <- Import(c(.464, .236, .136, .164))
> mean(tmpo$Ratio)
[1] 0.9171026      ### Not too terrible
> apply(c(tmpo$Ratio)*tmpo$nobs, 2, mean)
      n11      n21
 38.32863 25.44641 16.23419 11.70104    ## But also not very accurate
> mean(tmpo$WtRej)
[1] 0.0250326      ### Only upper-tail rejection cases accurately estimated!
> tmpo <- Import(c(.464, .236, .136, .164))     ## 2nd run
> apply(c(tmpo$Ratio)*tmpo$nobs, 2, mean)
      n11      n21
 31.43296 20.03611 12.40591 9.479093    ## Not accurate at all!
> mean(tmpo$Ratio)
[1] 0.7335407      ### This is a large part of the reason!!
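One informal way to quantify the problem with these likelihood ratios is the
(Kish) effective sample size of the importance weights,
(sum of weights)^2 / (sum of squared weights). A minimal sketch applied to the
Ratio component of a run of Import (the helper name effSS is my own addition,
not part of the assignment):

effSS <- function(w) sum(w)^2/sum(w^2)    ## Kish effective sample size
tmpo <- Import(c(.464, .236, .136, .164))
effSS(c(tmpo$Ratio))    ## a value much smaller than Nrep = 1000 signals that
                        ## a few large ratios f/g dominate the weighted averages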
I mentioned in class that one could try to search for a good probability vector
with which to do importance sampling by constraining the Xsq statistic to 3.84
for the expected table (with specified table total, default 100) and trying to
optimize Kullback-Leibler values. Ordinarily this is a good strategy, and I
tried to implement it using the following little function:

## We code the calculation of probability vectors qvec to try
## into a function, allowing also the possibility
## that the row-sum a is not .6 and the col-sum b is not .7.
## The function also calculates the Kullback-Leibler distances KL and KL2:

> KLfcnX
function(a, b, q11 = NULL, n = 100, cut = 3.84,
         p0 = c(0.42, 0.28, 0.18, 0.12), up = T)
{
     ## Here a = 1st row-prob, b = 1st col-prob, and
     ## if q11 is not specified, we put it equal to the
     ## upper-cell prob. s.t. Xsq for the expected table = cut.
     if(is.null(q11))
          q11 <- max(0, min(a * b + (2 * as.numeric(up) - 1) *
                     sqrt((cut * a * (1 - a) * b * (1 - b))/n), 1))
     qvec <- c(q11, b - q11, a - q11, 1 - a - b + q11)
     list(KL = sum(qvec * log(qvec/p0)), KL2 = sum(p0 * log(p0/qvec)),
          qvec = qvec)
}

> unlist(nlminb(c(.55,.65), function(av) KLfcnX(av[1], av[2])$KL)[c(1,2,4)])
 parameters1 parameters2   objective    grad.norm
   0.6010131   0.7003305  0.01898192 3.769823e-08
### This shows that one cannot do better than the values:
> round(unlist(KLfcnX(.6,.7)),5)
      KL   qvec1   qvec2   qvec3   qvec4
 0.01898 0.46399 0.23601 0.13601 0.16399

## Here is a `good' batch, but still not accurate enough !!
> tmpo <- Import(KLfcnX(.6,.7,cut=.384)$qvec)     ## about 90 rejections
> mean(tmpo$WtRej)
[1] 0.05281012
> mean(tmpo$Ratio)
[1] 0.9978234
> tmpo$pvec
[1] 0.4339117 0.2660883 0.1660883 0.1339117
> apply(tmpo$nobs, 2, mean)
    n11    n21
 43.378 26.742 16.496 13.384
> apply(c(tmpo$Ratio)*tmpo$nobs, 2, mean)
      n11      n21
 41.92289 28.17413 17.73505 11.95027
> var(tmpo$WtRej)      ## 0.1287217
### This means the half-width of the CI is
> 1.96*sqrt(var(tmpo$WtRej)/1000)      ## = .0222

### In this example, although we can arrange for more rejections
### under the modified multinomial distribution, importance
### sampling appears to INCREASE variability !
### The reason, in large part, is the terrific variability of
### the simulated likelihood ratios !!!

==========================================================

If you tried this problem for a number of different (multinomial) parameter
combinations, you probably found that often the `simulated' rejection
probability was from .02 to .03, which might lead you to suspect that only one
of the two tails was really being sampled in the limited number of 1000
replications. That is reasonable: in the multinomial distributions used so far
for importance sampling, those chi-squared rejection cases where the upper-left
cell count is too SMALL are too rare to be represented well (or maybe at all!?)
in simulations with 1000 replications.

This suggests the possibility that it might be clever to do importance sampling
by generating 50% of tables with n11 too high and 50% with it too low. That is,
perhaps things would behave a little more stably if we simulated from a MIXTURE
of two multinomial distributions and made THAT the working g.
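Before looking at the implementation, note the form the importance weight takes
under an equal-weight mixture g = (g1 + g2)/2 of two multinomials with
cell-probability vectors pvec and pvec2: the multinomial coefficients cancel in
the ratios g1/f and g2/f, so for an observed table nobs the weight is
f/g = 2/(exp(sum(nobs*log(pvec/p0))) + exp(sum(nobs*log(pvec2/p0)))), which is
what the Ratio line in the function below computes. A minimal stand-alone
sketch for a single table (the helper mixWt is my own illustration, not part of
the assignment):

mixWt <- function(nobs, p0, pvec, pvec2) {
     ## importance weight f/g for f = Multinomial(p0) and
     ## g = 0.5*Multinomial(pvec) + 0.5*Multinomial(pvec2);
     ## the multinomial coefficients cancel in each ratio
     2/(exp(sum(nobs*log(pvec/p0))) + exp(sum(nobs*log(pvec2/p0))))
}
p0 <- c(.42,.28,.18,.12)
mixWt(c(42,28,18,12), p0, p0 + c(1,-1,-1,1)*.036, p0 - c(1,-1,-1,1)*.036)
     ## weight assigned to the null expected table (42,28,18,12)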
Here is a function which does it.

> ImportMix
function(a, b, Delta, Nrep = 1000, p0 = c(0.42, 0.28, 0.18, 0.12))
{
     ### Here a is the 1st row-sum, b is the 1st col-sum, and
     ### Delta is abs(q11 - a*b). We simulate and do importance
     ### sampling from the mixture of two multinomials with
     ### equal weight, with q11 = a*b plus or minus Delta.
     uppr <- 2 * rbinom(Nrep, 1, 0.5) - 1
     p1f <- b + (Delta * uppr)/a
     p2f <- b - (uppr * Delta)/(1 - a)
     pvec <- p0 + c(1, -1, -1, 1) * Delta
     pvec2 <- p0 - c(1, -1, -1, 1) * Delta
     n1plus <- rbinom(Nrep, 100, a)
     n11 <- rbinom(Nrep, n1plus, p1f)
     n21 <- rbinom(Nrep, 100 - n1plus, p2f)
     ### Much simpler than working directly with "sample" !
     nplus1 <- n11 + n21
     nexp <- cbind((nplus1/100) * cbind(n1plus, 100 - n1plus),
                   (1 - nplus1/100) * cbind(n1plus, 100 - n1plus))
     nobs <- cbind(n11, n21, n1plus - n11, 100 - n1plus - n21)
     Xsq <- ((nobs - nexp)^2/nexp) %*% rep(1, 4)
     Ratio <- 2/(exp(nobs %*% log(pvec/p0)) + exp(nobs %*% log(pvec2/p0)))
     list(WtRej = (Xsq > 3.84) * Ratio, Xsq = Xsq, Ratio = Ratio,
          nobs = nobs, pvec = pvec, n1plus = n1plus)
}

> tmpo <- ImportMix(.6,.7,.036)
> tmpo$pvec
[1] 0.456 0.244 0.144 0.156
> summary(tmpo$Ratio)
    Min. 1st Qu. Median   Mean 3rd Qu.  Max.
 0.00453  0.2013 0.5443 0.9848   1.425 4.627
> round(c(mean(tmpo$WtRej), var(tmpo$WtRej), 1.96*sqrt(var(tmpo$WtRej)/1000)), 5)
[1] 0.05108 0.00763 0.00541
> sum(tmpo$WtRej>0)      ### 348 = # rejections

> tmpo <- ImportMix(.6,.7,.044)
> tmpo$pvec
[1] 0.464 0.236 0.136 0.164
> summary(tmpo$Ratio)
     Min. 1st Qu. Median  Mean 3rd Qu.  Max.
 0.000545 0.07522 0.2778 1.049   1.074 11.63
> round(c(mean(tmpo$WtRej), var(tmpo$WtRej), 1.96*sqrt(var(tmpo$WtRej)/1000)), 5)
[1] 0.04983 0.00565 0.00466
> sum(tmpo$WtRej>0)      ### 505 = # rejections

> tmpo <- ImportMix(.6,.7,0)
> round(c(mean(tmpo$WtRej), var(tmpo$WtRej), 1.96*sqrt(var(tmpo$WtRej)/1000)), 5)
[1] 0.05100 0.04845 0.01364

### So this idea WORKS --- the confidence interval based on an importance
### sample of 1000 replications is about 1/3 as wide as the one from a
### straightforward vanilla Monte Carlo simulation of the same size
### (Delta = 0 above). So the gains are very worthwhile !!!

### A DIFFICULT PROBLEM !

The importance-sampling gains from a mixture of multinomials in this problem do
not seem easy to summarize in terms of large-deviation probabilities. The point
seems to be that the mixture g makes many more of the cases of interest in the
Xsq rejection region have likelihood ratios f(X)/g(X) which are not too large
or small.
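As a compact way to rerun the comparison above, one could wrap the half-width
calculation in a small helper and apply it over a few Delta values. A minimal
sketch (the wrapper halfwidth is my own illustration, built on the ImportMix
function above; results vary from run to run):

halfwidth <- function(Delta, Nrep = 1000) {
     ## estimated 95% CI half-width for the rejection probability,
     ## using importance sampling from the mixture with the given Delta
     tmpo <- ImportMix(.6, .7, Delta, Nrep = Nrep)
     1.96 * sqrt(var(c(tmpo$WtRej))/Nrep)
}
sapply(c(0, .036, .044), halfwidth)
     ## Delta = 0 is vanilla Monte Carlo (half-width near .0135 above);
     ## Delta near .04 gave half-widths around .005 in the runs above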