STAT 705, Fall 2010

      Monte Carlo  Generation of Ranodom Variables with R
    ======================================================

Let F(x) be cdf, and U ~ Unif[0,1]. Define:

        X = minimum {x: F(x) >= U} = F^{-1}(U)

Then X ~ F(x).


0. R Functions for generating data
---------------------------------------

runif(n), rnorm(n,0,1), rbinom(n,n,p), rpois(n, lambda),...


1. Generating X ~ P(X=j)=1/6, j=1,2,3,4,5,6
--------------------------------------------------

Use floor(x): greatest integer leq x.

Example:
Suppose U=3/12= 1/6 + 1/12 ===> 1/6< U <2/6 ===> X=2
Get this from:  floor(6*3/12 +1)=floor(2.5)=2, etc. 


X <- 1:120        #Generation of 120 X's
count <- rep(0,6) #Input six 0's
for(i in 1:120){
X[i] <- floor(6*runif(1)+1)  #floor(x)= Greatest integer <= x
count[X[i]] <- count[X[i]] + 1}

> X
  [1] 6 6 1 5 5 4 6 6 4 3 3 2 6 3 6 2 4 3 1 2 2 2 5 4 6 3 4 6 1 5 3 3 5 1 2 6 4
 [38] 5 6 5 3 1 5 2 5 1 1 5 5 6 3 2 5 4 3 4 6 1 2 5 2 4 3 3 4 5 6 6 1 6 5 1 6 6
 [75] 1 3 4 6 3 1 1 6 3 6 5 2 4 4 5 3 1 2 3 4 1 4 2 5 2 4 5 1 2 2 6 4 5 2 1 4 3
[112] 4 6 5 6 5 1 5 6 4

> count
[1] 18 17 18 20 23 24


#Barplot:
par(oma=c(0,0,4,0)) #For Overall Title
par(mfrow=c(1,2))

names(count) <- c("1","2","3","4","5","6")
barplot(count, names=names(count))            #Simulation I
title("18 17 18 20 23 24 ")


names(count) <- c("1","2","3","4","5","6")    #Simulation II
barplot(count, names=names(count))
title("25 18 14 27 13 23 ")


mtext("FIG 1: 120 Die Tossings", side=3,outer=T, cex=1.5) #Overall Title

--------------------------------------------

More general: Generate n RV's from P(X=j)=1/6, j=1,...,6.
              Add Chi-Square Goodness of Fit Test.


Die <- function(n){ 
X <- 1:n  
count <- rep(0,6)
for(i in 1:n){
X[i] <- floor(6*runif(1)+1)
count[X[i]] <- count[X[i]] + 1}
ChiSq <- sum((6/n)*(count-n/6)^2)
pvalue <- 1-pchisq(ChiSq,5)
list("Count"=count, "ChiSq"=ChiSq,"pvalue"=pvalue)
}

> qchisq(0.95,5)
[1] 11.0705      #Critical value for alpha=0.05, df=5


> Die(120)
$Count:
[1] 20 21 22 18 19 20

$ChiSq:
[1] 0.5    #OK 

$pvalue:
[1] 0.9921233

> Die(120)
$Count:
[1] 16 22 20 17 24 21

$ChiSq:
[1] 2.3    #OK

$pvalue:
[1] 0.8062669

> Die(120)
$Count:
[1] 21 29 10 25 15 20

$ChiSq:
[1] 11.6   #Reject; not strongly.

$pvalue:
[1] 0.04069939

> unix.time(Die(2400))
  user  system elapsed
  0.062   0.003   0.067
---------------------------------------------

@@@@@@@@@@@@ Efficient Program: No "for" Loops @@@@@@@@@@@

Die <- function(n){ 
X <- floor(6*runif(n)+1)
f <- table(X)
count <- c(f[[1]],f[[2]],f[[3]],f[[4]],f[[5]],f[[6]])
ChiSq <- sum((6/n)*(count-n/6)^2)
pvalue <- 1-pchisq(ChiSq,5)
list("X"=X, "Table"=f, "Count"=count, "ChiSq"=ChiSq, "pvalue"=pvalue)}

>  Die(120)
$X
  [1] 6 5 6 6 2 1 3 2 3 6 4 5 1 6 6 4 1 4 4 5 6 5 2 1 6 1 4 2 3 1 5 4 6 3 4 3 5
 [38] 3 3 6 1 4 1 3 3 5 3 1 1 2 5 1 1 3 1 6 4 5 1 6 4 3 5 3 6 5 1 3 6 5 2 1 3 6
 [75] 3 1 1 3 2 3 5 3 5 3 3 1 1 5 4 5 3 1 1 3 1 2 2 5 2 4 4 1 2 6 4 1 6 6 1 5 3
[112] 3 1 5 6 6 3 2 3 2

$Table
X
 1  2  3  4  5  6
27 13 27 14 19 20

$Count
[1] 27 13 27 14 19 20

$ChiSq
[1] 9.2

$pvalue
[1] 0.1013479


> Die(120)
$X
  [1] 5 5 6 3 2 6 5 6 6 6 3 4 5 3 5 4 4 5 6 3 4 5 2 5 3 3 3 6 4 6 4 3 5 4 2 1 3
 [38] 1 6 1 3 1 5 5 1 3 3 4 5 5 3 1 1 4 2 1 4 4 3 6 4 2 1 1 5 3 1 3 2 3 4 3 3 2
 [75] 1 2 2 2 4 3 6 1 4 2 1 4 2 4 2 3 5 6 2 5 1 5 2 5 6 2 2 2 1 6 2 6 3 3 6 3 1
[112] 4 5 3 2 1 5 4 4 5

$Table
X
 1  2  3  4  5  6
18 20 25 20 21 16

$Count
[1] 18 20 25 20 21 16

$ChiSq
[1] 2.3

$pvalue
[1] 0.8062669  ##Better


> unix.time(Die(2400))  ###Faster here!
   user  system elapsed
  0.002   0.000   0.002


2. Generate Exponential(Lambda)  RV's
---------------------------------------


Exponential <- function(n,lambda){  
U <- runif(n)
X <- -(log(1-U))/lambda
X}

> Exponential(10,1)
 [1] 3.2354922 2.7719491 0.0451040 1.4465477 1.2236916 0.7002736 2.6401330
 [8] 1.8342899 0.7920876 0.6690501


##Or: Since  U and 1-U Both Unif(0,1), we may replace 1-U by U

Exponential <- function(n,lambda){  
U <- runif(n)
X <- -(log(U))/lambda 
list("lambda"=lambda, "X"=X)
}

> ExpData <- Exponential(1000,1)
> ExpData$lambda
[1] 1


> summary(ExpData$X)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
3.433e-05 3.015e-01 6.435e-01 9.464e-01 1.254e+00 8.912e+00

 > mean(ExpData$X)
[1] 0.9464022

> var(ExpData$X) 
[1] 0.9790306

##Check against rexp(1000,1) from R

ExpData <- Exponential(1000,1)
qqplot(ExpData$X,rexp(1000,1),main="QQ-Comparison")
abline(0,1)


3. Mixtures
--------------
Fraction of p customers replace their batteries at failure or one year
of usage, whichever comes first. The remaining q=1-p customers replace
their baterries at failure. We have:

X ~ Exponential(lambda)
Y= Min{X,1}
Z= Y w.p. p and Z=X w.p. q

### Problem: Generate Z's.


Note: To get Y= Min{X,1} can use: Y <- ifelse((X < 1), Y <- X, Y <- 1)
      or simply: Y <- pmin(X,1). But Y <- min(X,1) Is Not Good Here!

> X <- rexp(10,1)
> X
 [1] 1.0274276 1.6540601 3.8138664 2.1656609 0.2545461 3.5295724 1.6939928
 [8] 0.2057234 0.9062887 1.7004219


> Y <- ifelse((X < 1), Y <- X, Y <- 1)
> Y
 [1] 1.0000000 1.0000000 1.0000000 1.0000000 0.2545461 1.0000000 1.0000000
 [8] 0.2057234 0.9062887 1.0000000

Or, simpler:

> Y <- pmin(X,1)
> Y
 [1] 1.0000000 1.0000000 1.0000000 1.0000000 0.2545461 1.0000000 1.0000000
 [8] 0.2057234 0.9062887 1.0000000


But not this

> Y <- min(X,1)
> Y
[1] 0.2057234


Method I
=========

ExpoMix <- function(n,lambda,p){  
U <- runif(n)              #The first 2 lines can be replaced
X <- -(log(1-U))/lambda    #by   "X <- rexp(n,lambda)"
Y <- pmin(X,1)  
U <- runif(n)              #Need to generate mixture indicator
Z <- ifelse((U < p), Z <- Y, Z <- X)
ProbZequalOne <- p*exp(-lambda)  ##True prob P(Z=1)   
Error <- abs(ProbZequalOne - sum(Z ==1)/n)  ##Check P(Z=1)
HatProbZequalOne <- sum(Z ==1)/n ##Est of P(Z=1)
list("ProbZequalOne"=ProbZequalOne, "HatProbZequalOne"=HatProbZequalOne, 
     "Z"=Z, "Error"=Error)}


> Mixture <- ExpoMix(1000,1,0.4)

> Mixture$ProbZequalOne ##True P(Z=1)
[1] 0.1471518
> Mixture$HatProbZequalOne ##Est of P(Z=1)
[1] 0.144
> Mixture$Error
[1] 0.003151776

> Mixture$Z[1:20]
 [1] 1.00000000 0.11535434 0.48823905 1.23766697 0.12342818 0.08362285
 [7] 0.69099087 0.28763913 1.80842775 1.00000000 0.35905464 0.89413019
[13] 0.05253795 0.20578574 0.27968516 0.15701067 1.06495353 0.41765359
[19] 0.15329962 0.68511337


Method II: Check Method I.
=========
Use the inverse of mixture df directly.

Note Example: To get

     0, if v < 4
Z = 40, if 4 <= v <= 7
    20, if v > 7 

Use Nested "ifelse"

> v <- c(1,2,3,4,5,6,7,8,9)
> z <- ifelse((v < 4), z <- 0,ifelse((v > 7), z <- 20, z <- 40))
[1]  0  0  0 40 40 40 40 20 20


Call the data ZZ.

ExpoMixII <- function(n,lambda,p){ 
U <- runif(n)
ZZ <- ifelse((0 <= U) & (U <= 1- exp(-lambda)), ZZ <- (-1/lambda)*log(1-U),
ifelse((1- exp(-lambda) < U) & (U <= 1-(1-p)*exp(-lambda)), ZZ <- 1,
ZZ <- (-1/lambda)*log((1-U)/(1-p))))
ProbZZequalOne <- p*exp(-lambda)     
Error <- abs(ProbZZequalOne - sum(ZZ ==1)/n)  #Check P(ZZ=1)
HatProbZZequalOne <- sum(ZZ ==1)/n
list("HatProbZZequalOne"=HatProbZZequalOne, "ZZ"=ZZ,"Error"=Error)
}

MixtureII <- ExpoMixII(1000,1,0.4)

> MixtureII$HatProbZZequalOne
[1] 0.149
> MixtureII$Error
[1] 0.001848224

> MixtureII$ZZ[1:20]
 [1] 0.18448242 0.17954087 0.07321502 3.49700058 0.07644807 1.51108284
 [7] 0.37363428 1.42463626 1.38869435 0.44310912 1.00000000 0.62670673
[13] 0.29317864 0.26155529 0.38255592 0.92457030 1.08222099 1.01251498
[19] 0.72811314 0.87255661
 

Comparisons.
-------------

par(mfrow=c(2,2)); par(oma=c(0,0,2,0)) #Overall Title 

qqplot(Mixture$Z,MixtureII$ZZ) #Compare the two methods: Z vs ZZ.
abline(0,1)
title("Two Methods: Z,ZZ")

> EX <- rexp(1000,1) 
> YY <- pmin(EX,1)

qqplot(Mixture$Z,EX) #Compare Z with EX ~ exponential(1)
abline(0,1)
title("Z, EXP(1)")

qqplot(Mixture$Z,YY) #Compare Z with Y=min(EX,1)
abline(0,1)
title("Z, YY = min(EXP(1),1)")

qqplot(EX,YY)
abline(0,1)
title("EXP(1), YY = min(EXP(1),1)")

mtext("Comparison of Z,ZZ,EXP(1),YY = min(EXP(1),1)", side=3,outer=T)


4. Contamination (Also Mixiture!)
-------------------------------------

"contam"  is N(0,1) w.p. .9 and N(0,9) w.p. 0.1. 
Trick is to randomize the variance: 1 w.p. .9 and 3^2 w.p. 0.1. 

contam <- rnorm(200,0, 1+2*rbinom(200,1,0.1))
stem(contam)

N = 200   Median = 0.0361128
Quartiles = -0.76384, 0.860758

Decimal point is at the colon

  -3 : 9
  -3 :
  -2 : 776
  -2 : 2110
  -1 : 9887776555555
  -1 : 4444443333222100
  -0 : 99999888888888777777776666666555555555
  -0 : 44443333332222221100
   0 : 00000011122222222333333334444444444
   0 : 55556666777777888889999999
   1 : 000001111111122222233344
   1 : 55667778889
   2 : 01344
   2 :
   3 : 4
   3 : 9

High: 4.89752 7.31264    #<---- Largest values of sample.


5. Generating Bivariate RV's
---------------------------------------

U1,U2, iid Unif[0,1]. Assume we have F(x), F(y|x). 

Step 1: Use U1 to get X from F(x). 
Step 2. Use U2 to get Y from F(y|X=x)
Then: (X,Y) ~ f(x,y)

Example: Generate (X,Y) from f(x,y)=exp(-y), 0<x<y.

Then X ~ Exponential(1), Y|X=x ~ exp[-(y-x)], 0<x<y
and,
 
X=-log(1-U1), Y= X -log(1-U2)

biv <- function(n){
U1 <- runif(n)
U2 <- runif(n)
X <- -log(1-U1)
Y <- X -log(1-U2)
corrXY <- cor(X,Y)     #For a verification of the simulation.
list("X"=X,"Y"=Y,"corrXY"=corrXY)}


> result <- biv(1000)
> result$corrXY
[1] 0.7060981     #<-- OK: True Corr(X,Y)=sqrt(2)/2=0.7071068   


plot(result$X,result$Y, main="f(x,y)=exp(-y), 0<x<y")
legend(4,2.25,c("Corr(X,Y)=0.707", "Sample Corr=0.711")) 

--------------------
Case of Dependent U1,U2. <----No! No! No!
U1,U2, must be independent. Dependence may lead to errors.

biv <- function(n){
U1 <- runif(n)
U2 <- runif(n)
X <- -log(1-U1)
Y <- X -log(U1)        #Dependence: U2=1-U1
corrXY <- cor(X,Y)     #For a verification of the simulation.
list("X"=X,"Y"=Y,"corrXY"=corrXY)}

result <- biv(1000)
> result$corrXY
[1] 0.3498812          #Incorrect correlation.

plot(result$X,result$Y, main="U2=1-U1")
---------------------


6. Generating Multivariate Normal Random Vectors
---------------------------------------------------

Scalar Case: Box-Muller algorithm generates N(0,1)'s from Unif[0,1]'s

      U1,U2 iid Unif[0,1] ====> Z1,Z2 iid N(0,1)
By:
      Z1 <- sqrt(-2*log(U1))*cos(2*pi*U2)
      Z2 <- sqrt(-2*log(U1))*sin(2*pi*U2)

> BoxMuller <- function(n)
{
        U1 <- runif(n)
        U2 <- runif(n)
         Z <- sqrt(-2 * log(U1)) * cos(2 * pi * U2) #Suffices.
       #Z2 <- sqrt(-2 * log(U1)) * sin(2 * pi * U2) 
        Z
}

Compare with "rnorm":

Y <- BoxMuller(1000)
Z <- rnorm(1000)
qqplot(Z,Y)
>abline(0,1)


------------
Vector Case:
Suppose we wish to generate

         X ~ N(mu,Sigma), mu mean vector, Sigma pd. cov. matrix.

Step 1. Sigma= AA', A lower triangular
Step 2. Get Z ~ N(0,I)
Step 3. X = AZ+mu is the desired multivariate random vector.


"choleski(Sigma)" or "chol(Sigma)" give "R'R" where R is upper triangular. 
Thus take A=R'.

>  Sigma <- matrix(c(4,1,2,1,9,-3,2,-3,25),ncol=3)
> Sigma
     [,1] [,2] [,3] 
[1,]    4    1    2
[2,]    1    9   -3
[3,]    2   -3   25

> R <- chol(Sigma)
> R
     [,1]    [,2]      [,3] 
[1,]    2 0.50000  1.000000
[2,]    0 2.95804 -1.183216    #Upper triangular.
[3,]    0 0.00000  4.753946
attr(, "rank"):
[1] 3


> A <- t(R)
> A
     [,1]      [,2]     [,3] 
[1,]  2.0  0.000000 0.000000
[2,]  0.5  2.958040 0.000000   #Lower triangular.
[3,]  1.0 -1.183216 4.753946


> A%*%t(A)
     [,1] [,2] [,3] 
[1,]    4    1    2   #OK, get Sigma back.
[2,]    1    9   -3
[3,]    2   -3   25

--------------

Generate n N(mu,Sigma) vectors:

rMultNorm <- function(n,mu,Sigma){
Z <- matrix(rnorm(length(mu)*n), ncol=n)
A <- t(chol(Sigma))
X <- t(A%*%Z +mu) #Rows are the obsevations, columns are variables
X}                #as required by cor(.), var(.).


> rMultNorm(7,c(0,0,0),Sigma)
           [,1]       [,2]        [,3] 
[1,] -4.3554456 -7.7590670 -14.8477570
[2,] -0.7256142 -2.7451099   0.7491262
[3,]  0.7140383 -1.0252797  -6.4598310     #7 vectors 1 by 3
[4,]  0.4750912  0.8717682  -0.8407338
[5,]  0.8533848  0.1425210  -5.7321204
[6,] -2.7933969  0.4403477  -4.8940260
[7,]  2.2688835 -5.0602202   3.1075679

Check by sample cov matrix:

> var(rMultNorm(150,c(0,0,0),Sigma))
         [,1]      [,2]       [,3]            #True Sigma:
[1,] 4.580810  1.668798   2.501892            # 4    1    2
[2,] 1.668798  9.534287  -2.927612            # 1    9   -3         
[3,] 2.501892 -2.927612  24.291489            # 2   -3   25


> var(rMultNorm(1000,c(0,0,0),Sigma))
         [,1]      [,2]       [,3] 
[1,] 3.659185  1.015151   1.950867            #n=1000 vectors
[2,] 1.015151  9.233407  -3.643383
[3,] 1.950867 -3.643383  26.379103


Check by means and histograms:

X <- rMultNorm(1000,c(0,0,0),Sigma)

> apply(X, 2, mean)                              
[1] -0.02826893  0.01457916  0.02276847


p <- ppoints(300) 

hist(X[,1], den=12,prob=T, main="N(0,4)")
lines(qnorm(p,0,2),dnorm(qnorm(p,0,2),0,2))

hist(X[,2], den=12,prob=T, main="N(0,9)")
lines(qnorm(p,0,3),dnorm(qnorm(p,0,3),0,3))

hist(X[,3], den=12,prob=T, main="N(0,5)")
lines(qnorm(p,0,5),dnorm(qnorm(p,0,5),0,5))