STAT 705, Fall 2010 Monte Carlo Generation of Ranodom Variables with R ====================================================== Let F(x) be cdf, and U ~ Unif[0,1]. Define: X = minimum {x: F(x) >= U} = F^{-1}(U) Then X ~ F(x). 0. R Functions for generating data --------------------------------------- runif(n), rnorm(n,0,1), rbinom(n,n,p), rpois(n, lambda),... 1. Generating X ~ P(X=j)=1/6, j=1,2,3,4,5,6 -------------------------------------------------- Use floor(x): greatest integer leq x. Example: Suppose U=3/12= 1/6 + 1/12 ===> 1/6< U <2/6 ===> X=2 Get this from: floor(6*3/12 +1)=floor(2.5)=2, etc. X <- 1:120 #Generation of 120 X's count <- rep(0,6) #Input six 0's for(i in 1:120){ X[i] <- floor(6*runif(1)+1) #floor(x)= Greatest integer <= x count[X[i]] <- count[X[i]] + 1} > X [1] 6 6 1 5 5 4 6 6 4 3 3 2 6 3 6 2 4 3 1 2 2 2 5 4 6 3 4 6 1 5 3 3 5 1 2 6 4 [38] 5 6 5 3 1 5 2 5 1 1 5 5 6 3 2 5 4 3 4 6 1 2 5 2 4 3 3 4 5 6 6 1 6 5 1 6 6 [75] 1 3 4 6 3 1 1 6 3 6 5 2 4 4 5 3 1 2 3 4 1 4 2 5 2 4 5 1 2 2 6 4 5 2 1 4 3 [112] 4 6 5 6 5 1 5 6 4 > count [1] 18 17 18 20 23 24 #Barplot: par(oma=c(0,0,4,0)) #For Overall Title par(mfrow=c(1,2)) names(count) <- c("1","2","3","4","5","6") barplot(count, names=names(count)) #Simulation I title("18 17 18 20 23 24 ") names(count) <- c("1","2","3","4","5","6") #Simulation II barplot(count, names=names(count)) title("25 18 14 27 13 23 ") mtext("FIG 1: 120 Die Tossings", side=3,outer=T, cex=1.5) #Overall Title -------------------------------------------- More general: Generate n RV's from P(X=j)=1/6, j=1,...,6. Add Chi-Square Goodness of Fit Test. Die <- function(n){ X <- 1:n count <- rep(0,6) for(i in 1:n){ X[i] <- floor(6*runif(1)+1) count[X[i]] <- count[X[i]] + 1} ChiSq <- sum((6/n)*(count-n/6)^2) pvalue <- 1-pchisq(ChiSq,5) list("Count"=count, "ChiSq"=ChiSq,"pvalue"=pvalue) } > qchisq(0.95,5) [1] 11.0705 #Critical value for alpha=0.05, df=5 > Die(120) $Count: [1] 20 21 22 18 19 20 $ChiSq: [1] 0.5 #OK $pvalue: [1] 0.9921233 > Die(120) $Count: [1] 16 22 20 17 24 21 $ChiSq: [1] 2.3 #OK $pvalue: [1] 0.8062669 > Die(120) $Count: [1] 21 29 10 25 15 20 $ChiSq: [1] 11.6 #Reject; not strongly. $pvalue: [1] 0.04069939 > unix.time(Die(2400)) user system elapsed 0.062 0.003 0.067 --------------------------------------------- @@@@@@@@@@@@ Efficient Program: No "for" Loops @@@@@@@@@@@ Die <- function(n){ X <- floor(6*runif(n)+1) f <- table(X) count <- c(f[[1]],f[[2]],f[[3]],f[[4]],f[[5]],f[[6]]) ChiSq <- sum((6/n)*(count-n/6)^2) pvalue <- 1-pchisq(ChiSq,5) list("X"=X, "Table"=f, "Count"=count, "ChiSq"=ChiSq, "pvalue"=pvalue)} > Die(120) $X [1] 6 5 6 6 2 1 3 2 3 6 4 5 1 6 6 4 1 4 4 5 6 5 2 1 6 1 4 2 3 1 5 4 6 3 4 3 5 [38] 3 3 6 1 4 1 3 3 5 3 1 1 2 5 1 1 3 1 6 4 5 1 6 4 3 5 3 6 5 1 3 6 5 2 1 3 6 [75] 3 1 1 3 2 3 5 3 5 3 3 1 1 5 4 5 3 1 1 3 1 2 2 5 2 4 4 1 2 6 4 1 6 6 1 5 3 [112] 3 1 5 6 6 3 2 3 2 $Table X 1 2 3 4 5 6 27 13 27 14 19 20 $Count [1] 27 13 27 14 19 20 $ChiSq [1] 9.2 $pvalue [1] 0.1013479 > Die(120) $X [1] 5 5 6 3 2 6 5 6 6 6 3 4 5 3 5 4 4 5 6 3 4 5 2 5 3 3 3 6 4 6 4 3 5 4 2 1 3 [38] 1 6 1 3 1 5 5 1 3 3 4 5 5 3 1 1 4 2 1 4 4 3 6 4 2 1 1 5 3 1 3 2 3 4 3 3 2 [75] 1 2 2 2 4 3 6 1 4 2 1 4 2 4 2 3 5 6 2 5 1 5 2 5 6 2 2 2 1 6 2 6 3 3 6 3 1 [112] 4 5 3 2 1 5 4 4 5 $Table X 1 2 3 4 5 6 18 20 25 20 21 16 $Count [1] 18 20 25 20 21 16 $ChiSq [1] 2.3 $pvalue [1] 0.8062669 ##Better > unix.time(Die(2400)) ###Faster here! user system elapsed 0.002 0.000 0.002 2. Generate Exponential(Lambda) RV's --------------------------------------- Exponential <- function(n,lambda){ U <- runif(n) X <- -(log(1-U))/lambda X} > Exponential(10,1) [1] 3.2354922 2.7719491 0.0451040 1.4465477 1.2236916 0.7002736 2.6401330 [8] 1.8342899 0.7920876 0.6690501 ##Or: Since U and 1-U Both Unif(0,1), we may replace 1-U by U Exponential <- function(n,lambda){ U <- runif(n) X <- -(log(U))/lambda list("lambda"=lambda, "X"=X) } > ExpData <- Exponential(1000,1) > ExpData$lambda [1] 1 > summary(ExpData$X) Min. 1st Qu. Median Mean 3rd Qu. Max. 3.433e-05 3.015e-01 6.435e-01 9.464e-01 1.254e+00 8.912e+00 > mean(ExpData$X) [1] 0.9464022 > var(ExpData$X) [1] 0.9790306 ##Check against rexp(1000,1) from R ExpData <- Exponential(1000,1) qqplot(ExpData$X,rexp(1000,1),main="QQ-Comparison") abline(0,1) 3. Mixtures -------------- Fraction of p customers replace their batteries at failure or one year of usage, whichever comes first. The remaining q=1-p customers replace their baterries at failure. We have: X ~ Exponential(lambda) Y= Min{X,1} Z= Y w.p. p and Z=X w.p. q ### Problem: Generate Z's. Note: To get Y= Min{X,1} can use: Y <- ifelse((X < 1), Y <- X, Y <- 1) or simply: Y <- pmin(X,1). But Y <- min(X,1) Is Not Good Here! > X <- rexp(10,1) > X [1] 1.0274276 1.6540601 3.8138664 2.1656609 0.2545461 3.5295724 1.6939928 [8] 0.2057234 0.9062887 1.7004219 > Y <- ifelse((X < 1), Y <- X, Y <- 1) > Y [1] 1.0000000 1.0000000 1.0000000 1.0000000 0.2545461 1.0000000 1.0000000 [8] 0.2057234 0.9062887 1.0000000 Or, simpler: > Y <- pmin(X,1) > Y [1] 1.0000000 1.0000000 1.0000000 1.0000000 0.2545461 1.0000000 1.0000000 [8] 0.2057234 0.9062887 1.0000000 But not this > Y <- min(X,1) > Y [1] 0.2057234 Method I ========= ExpoMix <- function(n,lambda,p){ U <- runif(n) #The first 2 lines can be replaced X <- -(log(1-U))/lambda #by "X <- rexp(n,lambda)" Y <- pmin(X,1) U <- runif(n) #Need to generate mixture indicator Z <- ifelse((U < p), Z <- Y, Z <- X) ProbZequalOne <- p*exp(-lambda) ##True prob P(Z=1) Error <- abs(ProbZequalOne - sum(Z ==1)/n) ##Check P(Z=1) HatProbZequalOne <- sum(Z ==1)/n ##Est of P(Z=1) list("ProbZequalOne"=ProbZequalOne, "HatProbZequalOne"=HatProbZequalOne, "Z"=Z, "Error"=Error)} > Mixture <- ExpoMix(1000,1,0.4) > Mixture$ProbZequalOne ##True P(Z=1) [1] 0.1471518 > Mixture$HatProbZequalOne ##Est of P(Z=1) [1] 0.144 > Mixture$Error [1] 0.003151776 > Mixture$Z[1:20] [1] 1.00000000 0.11535434 0.48823905 1.23766697 0.12342818 0.08362285 [7] 0.69099087 0.28763913 1.80842775 1.00000000 0.35905464 0.89413019 [13] 0.05253795 0.20578574 0.27968516 0.15701067 1.06495353 0.41765359 [19] 0.15329962 0.68511337 Method II: Check Method I. ========= Use the inverse of mixture df directly. Note Example: To get 0, if v < 4 Z = 40, if 4 <= v <= 7 20, if v > 7 Use Nested "ifelse" > v <- c(1,2,3,4,5,6,7,8,9) > z <- ifelse((v < 4), z <- 0,ifelse((v > 7), z <- 20, z <- 40)) [1] 0 0 0 40 40 40 40 20 20 Call the data ZZ. ExpoMixII <- function(n,lambda,p){ U <- runif(n) ZZ <- ifelse((0 <= U) & (U <= 1- exp(-lambda)), ZZ <- (-1/lambda)*log(1-U), ifelse((1- exp(-lambda) < U) & (U <= 1-(1-p)*exp(-lambda)), ZZ <- 1, ZZ <- (-1/lambda)*log((1-U)/(1-p)))) ProbZZequalOne <- p*exp(-lambda) Error <- abs(ProbZZequalOne - sum(ZZ ==1)/n) #Check P(ZZ=1) HatProbZZequalOne <- sum(ZZ ==1)/n list("HatProbZZequalOne"=HatProbZZequalOne, "ZZ"=ZZ,"Error"=Error) } MixtureII <- ExpoMixII(1000,1,0.4) > MixtureII$HatProbZZequalOne [1] 0.149 > MixtureII$Error [1] 0.001848224 > MixtureII$ZZ[1:20] [1] 0.18448242 0.17954087 0.07321502 3.49700058 0.07644807 1.51108284 [7] 0.37363428 1.42463626 1.38869435 0.44310912 1.00000000 0.62670673 [13] 0.29317864 0.26155529 0.38255592 0.92457030 1.08222099 1.01251498 [19] 0.72811314 0.87255661 Comparisons. ------------- par(mfrow=c(2,2)); par(oma=c(0,0,2,0)) #Overall Title qqplot(Mixture$Z,MixtureII$ZZ) #Compare the two methods: Z vs ZZ. abline(0,1) title("Two Methods: Z,ZZ") > EX <- rexp(1000,1) > YY <- pmin(EX,1) qqplot(Mixture$Z,EX) #Compare Z with EX ~ exponential(1) abline(0,1) title("Z, EXP(1)") qqplot(Mixture$Z,YY) #Compare Z with Y=min(EX,1) abline(0,1) title("Z, YY = min(EXP(1),1)") qqplot(EX,YY) abline(0,1) title("EXP(1), YY = min(EXP(1),1)") mtext("Comparison of Z,ZZ,EXP(1),YY = min(EXP(1),1)", side=3,outer=T) 4. Contamination (Also Mixiture!) ------------------------------------- "contam" is N(0,1) w.p. .9 and N(0,9) w.p. 0.1. Trick is to randomize the variance: 1 w.p. .9 and 3^2 w.p. 0.1. contam <- rnorm(200,0, 1+2*rbinom(200,1,0.1)) stem(contam) N = 200 Median = 0.0361128 Quartiles = -0.76384, 0.860758 Decimal point is at the colon -3 : 9 -3 : -2 : 776 -2 : 2110 -1 : 9887776555555 -1 : 4444443333222100 -0 : 99999888888888777777776666666555555555 -0 : 44443333332222221100 0 : 00000011122222222333333334444444444 0 : 55556666777777888889999999 1 : 000001111111122222233344 1 : 55667778889 2 : 01344 2 : 3 : 4 3 : 9 High: 4.89752 7.31264 #<---- Largest values of sample. 5. Generating Bivariate RV's --------------------------------------- U1,U2, iid Unif[0,1]. Assume we have F(x), F(y|x). Step 1: Use U1 to get X from F(x). Step 2. Use U2 to get Y from F(y|X=x) Then: (X,Y) ~ f(x,y) Example: Generate (X,Y) from f(x,y)=exp(-y), 0 result <- biv(1000) > result$corrXY [1] 0.7060981 #<-- OK: True Corr(X,Y)=sqrt(2)/2=0.7071068 plot(result$X,result$Y, main="f(x,y)=exp(-y), 0 result$corrXY [1] 0.3498812 #Incorrect correlation. plot(result$X,result$Y, main="U2=1-U1") --------------------- 6. Generating Multivariate Normal Random Vectors --------------------------------------------------- Scalar Case: Box-Muller algorithm generates N(0,1)'s from Unif[0,1]'s U1,U2 iid Unif[0,1] ====> Z1,Z2 iid N(0,1) By: Z1 <- sqrt(-2*log(U1))*cos(2*pi*U2) Z2 <- sqrt(-2*log(U1))*sin(2*pi*U2) > BoxMuller <- function(n) { U1 <- runif(n) U2 <- runif(n) Z <- sqrt(-2 * log(U1)) * cos(2 * pi * U2) #Suffices. #Z2 <- sqrt(-2 * log(U1)) * sin(2 * pi * U2) Z } Compare with "rnorm": Y <- BoxMuller(1000) Z <- rnorm(1000) qqplot(Z,Y) >abline(0,1) ------------ Vector Case: Suppose we wish to generate X ~ N(mu,Sigma), mu mean vector, Sigma pd. cov. matrix. Step 1. Sigma= AA', A lower triangular Step 2. Get Z ~ N(0,I) Step 3. X = AZ+mu is the desired multivariate random vector. "choleski(Sigma)" or "chol(Sigma)" give "R'R" where R is upper triangular. Thus take A=R'. > Sigma <- matrix(c(4,1,2,1,9,-3,2,-3,25),ncol=3) > Sigma [,1] [,2] [,3] [1,] 4 1 2 [2,] 1 9 -3 [3,] 2 -3 25 > R <- chol(Sigma) > R [,1] [,2] [,3] [1,] 2 0.50000 1.000000 [2,] 0 2.95804 -1.183216 #Upper triangular. [3,] 0 0.00000 4.753946 attr(, "rank"): [1] 3 > A <- t(R) > A [,1] [,2] [,3] [1,] 2.0 0.000000 0.000000 [2,] 0.5 2.958040 0.000000 #Lower triangular. [3,] 1.0 -1.183216 4.753946 > A%*%t(A) [,1] [,2] [,3] [1,] 4 1 2 #OK, get Sigma back. [2,] 1 9 -3 [3,] 2 -3 25 -------------- Generate n N(mu,Sigma) vectors: rMultNorm <- function(n,mu,Sigma){ Z <- matrix(rnorm(length(mu)*n), ncol=n) A <- t(chol(Sigma)) X <- t(A%*%Z +mu) #Rows are the obsevations, columns are variables X} #as required by cor(.), var(.). > rMultNorm(7,c(0,0,0),Sigma) [,1] [,2] [,3] [1,] -4.3554456 -7.7590670 -14.8477570 [2,] -0.7256142 -2.7451099 0.7491262 [3,] 0.7140383 -1.0252797 -6.4598310 #7 vectors 1 by 3 [4,] 0.4750912 0.8717682 -0.8407338 [5,] 0.8533848 0.1425210 -5.7321204 [6,] -2.7933969 0.4403477 -4.8940260 [7,] 2.2688835 -5.0602202 3.1075679 Check by sample cov matrix: > var(rMultNorm(150,c(0,0,0),Sigma)) [,1] [,2] [,3] #True Sigma: [1,] 4.580810 1.668798 2.501892 # 4 1 2 [2,] 1.668798 9.534287 -2.927612 # 1 9 -3 [3,] 2.501892 -2.927612 24.291489 # 2 -3 25 > var(rMultNorm(1000,c(0,0,0),Sigma)) [,1] [,2] [,3] [1,] 3.659185 1.015151 1.950867 #n=1000 vectors [2,] 1.015151 9.233407 -3.643383 [3,] 1.950867 -3.643383 26.379103 Check by means and histograms: X <- rMultNorm(1000,c(0,0,0),Sigma) > apply(X, 2, mean) [1] -0.02826893 0.01457916 0.02276847 p <- ppoints(300) hist(X[,1], den=12,prob=T, main="N(0,4)") lines(qnorm(p,0,2),dnorm(qnorm(p,0,2),0,2)) hist(X[,2], den=12,prob=T, main="N(0,9)") lines(qnorm(p,0,3),dnorm(qnorm(p,0,3),0,3)) hist(X[,3], den=12,prob=T, main="N(0,5)") lines(qnorm(p,0,5),dnorm(qnorm(p,0,5),0,5))