Summer Math Institute Prob/Stat Session R Demo Log: ================================================== [start R running in Rdemos folder] source("Demo1A.txt") source("Demo1B.txt") source("Demo1C.txt") source("Demo2.txt") CircP(1e5) CircP(1e6) CircP(1e6, 1/9) CondProb(.2,.6,.3,.8) CondProb(.2,.6,.3,.8,"C") CondProb(.2,.6,.3,.8,"T") CondProb(.2,.6,.5,.9,"T", plot=T) CondProb(.3,.5,.6,.9,"C", plot=T) ### Try some other possiblities, e.g.: > CondProb(.5,.7,.4,.8,"T", N=1e5, plot=T) Npt Np.Yrang Cprob 4.985700e+04 2.366600e+04 1.690188e-01 source("CumPoker.txt") > HistDemo(1e4, type="b") > HistDemo(1e3, type="g") > HistDemo(1e5, L=100) source("PermShoe.txt") ------------------------------------------------------- Demo1A.txt: a = 7^5 b = 0 m = 2^31-1 plot(0,0, xlim=c(0,1), ylim=c(0,1), xlab="x_n value", ylab = "x_{n+1}", main = paste( "Scatterplot of every 10th (U_n, U_{n+1})", "\n", "for Park & Miller 1988 Pseudo RNG"), type="n") xval = numeric(101) xval[1] = trunc(runif(1)*2^32) for(i in 100:1e4) xval[1+i%/%100] = runif(1) for(i in 1:1e3) { for(j in 1:100) xval[j+1] = (a*xval[j]) %% m points(xval[1+(0:9)*10]/2^31,xval[2+(0:9)*10]/2^31) xval[1] = xval[101] } Demo1B.txt: plot(0,0, xlim=c(0,1), ylim=c(0,1), xlab="x_n value", ylab = "x_{n+1}", main = paste( "Scatterplot of every (U_n,U_{n+1})", "\n", "for R package RNG"), type="n") for(i in 1:1e4) x = runif(1) for(i in 1:1e4) { y = runif(1) points(x,y) x=y } Demo1C.txt: xval = runif(1e5) for(i in 1:100) { for(j in 1:1000) x = runif(1) hist(xval[1:(i*1000)], nclass=25, prob=T, main=paste("Histograms from Cumulatively Updated R RNG", "\n","(Relative Frequency Histograms)")) } ----------------------------------------------------------- > Cards = c(2:10,"J","Q","K","A") > Cards [1] "2" "3" "4" "5" "6" "7" "8" "9" "10" "J" "Q" "K" "A" > Suit = c("Cl","Di","He","Sp") > Poker = outer(Cards,Suit, function(x,y) paste(x,y,sep=".")) > Poker [,1] [,2] [,3] [,4] [1,] "2.Cl" "2.Di" "2.He" "2.Sp" [2,] "3.Cl" "3.Di" "3.He" "3.Sp" [3,] "4.Cl" "4.Di" "4.He" "4.Sp" [4,] "5.Cl" "5.Di" "5.He" "5.Sp" [5,] "6.Cl" "6.Di" "6.He" "6.Sp" [6,] "7.Cl" "7.Di" "7.He" "7.Sp" [7,] "8.Cl" "8.Di" "8.He" "8.Sp" [8,] "9.Cl" "9.Di" "9.He" "9.Sp" [9,] "10.Cl" "10.Di" "10.He" "10.Sp" [10,] "J.Cl" "J.Di" "J.He" "J.Sp" [11,] "Q.Cl" "Q.Di" "Q.He" "Q.Sp" [12,] "K.Cl" "K.Di" "K.He" "K.Sp" [13,] "A.Cl" "A.Di" "A.He" "A.Sp" Tally = array(1e5) for(i in 1:1e5) { Xvec = sample(52, 5) Xcard = 1 + (Xvec-1) %% 13 Xsuit = (Xvec+12) %/% 13 Tally[i] = sum(table(Xcard)==2)==2 } > sum(Tally) [1] 4793 ### divide by 1e5 to get estimated prob = .04793 ### Compare with prob of 2pr = (120/prod(48:52))*(13*6*6^2*44) [1] 0.04753902 ### This takes time [5+ minutes!] because of slow looping in R ! ### To speed up: make an array of 1e5 by 10 of 1:52 ### Go across rows to find the first 5 distinct ones ### Parallelize reduction to card values Xarr = array(sample(52,1e6, rep=T), c(1e5,10)) ### NB: do not always get 5 unique in rows of 8 !! Xarr2 = apply(Xarr,1, function(row) unique(row)[1:5]) ## ~20secs Xarr2 = 1 + (Xarr2-1) %% 13 ### < 1 sec Tally = apply(Xarr2,2,function(col) sum(table(col)==2)) > table(Xarr2) Xarr2 1 2 3 4 5 6 7 8 9 10 11 12 13 38485 38467 38628 38515 38568 38344 38150 38240 38421 38330 38860 38431 38561 Tally = apply(Xarr2, 1, function(seq) sum(table(unique(seq)[1:5])==2)) ### This was faster, maybe 3 minutes instead of 7 > table(Tally) Tally ## 2nd run gives estimated prob = .04827 0 1 2 52669 42504 4827 ## Now for 3rd run: Xarr = array(sample(52,1e6, rep=T), c(1e5,10)) ### NB: do not always get 5 unique in rows of 8 !! Xarr2 = apply(Xarr,1, function(row) unique(row)[1:5]) ## ~20secs Xarr2 = 1 + (Xarr2-1) %% 13 ### < 1 sec Tally = apply(Xarr2,2,function(col) sum(table(col)==2)) > table(Tally) Tally 0 1 2 52880 42341 4779 --------------------------------------------- Geometric Probability Runs > CircP = function(N, rad.sq = .25) { Dvec=(runif(N)-.5)^2+(runif(N)-.5)^2 c(InCirc = mean(Dvec < rad.sq), AvDistSq= mean(Dvec)) } > CircP(1e5) InCirc AvDistSq 0.7871300 0.1662735 > CircP(1e5) InCirc AvDistSq 0.7855700 0.1664821 > CircP(1e5) InCirc AvDistSq 0.785860 0.166589 > CircP(1e6) InCirc AvDistSq 0.7849080 0.1668259 ### Do these in class ! ### Same function to find fraction within circle of ### radius 1/3 of center (1/2, 1/2): > CircP(1e6, 1/9) InCirc AvDistSq 0.3491230 0.1667787 > CircP(1e6, 1/9) InCirc AvDistSq 0.349002 0.166720 > CircP(1e6, 1/9) InCirc AvDistSq 0.3490410 0.1665911 #### > pi/9 [1] 0.3490659 ----------------------------------------- Demo2: Plotting Geometric-Prob Runs: > Uvec = runif(1e3) Vvec = runif(1e3) inds = (1:1e3)[(Uvec-.5)^2+(Vvec-.5)^2 < .25] plot(Uvec[inds],Vvec[inds], xlab="X",ylab="Y", main=paste( "Random Points in Square Falling in Inscribed Circle","\n", "With Smaller Radius 1/3 Circle Plotted"), xlim=c(0,1), ylim=c(0,1)) lines(.5+(1/3)*cos((1:1000)*pi/500), .5+(1/3)*sin((1:1000)*pi/500), lty=1) -------------------------------------------- Conditional prob. X in (.2,.6) given Y in (.3,.8) A) if (X,Y) random in the square B) if (X,Y) random in the circle \ $(X-.5)^2+(Y-.5)^2<0.25$ C) if (X,Y) random in the triangle \ $X < Y$ \ei CondProb= function(a,b,y1,y2, type="S", N=1e6, plot=F) { ### type "S" for square, "C" circle, or "T" triangle ### function finds rel freq U in (a,b) among cases with ### (U,V) in specified region and V in specified interval Uvec=runif(N) Vvec=runif(N) inds0 = if(type=="S") rep(TRUE,N) else {if(type=="C") (Uvec-.5)^2+(Vvec-.5)^2<.25 else Uvec < Vvec } inds1 = inds0 & (Vvec > y1 & Vvec <= y2) if(plot) { inds2 = inds1 & ((1:N) %% 20 == 0) plot(Uvec[inds2],Vvec[inds2], xlab="X", ylab="Y", main=paste("Plot of Random Points in Chosen Region", "\n","with Y in (",round(y1,1),",",round(y2,1),")", sep="")) abline(v=a, lty=2) abline(v=b, lty=2) } c(Npt=sum(inds0), Np.Yrang=sum(inds1), Cprob= mean(Uvec[inds1] > a & Uvec[inds1] <= b)) } > CondProb(.2,.6,.3,.8) ### Do these in class: Npt Np.Yrang Cprob 1.000000e+06 5.008280e+05 4.006445e-01 > CondProb(.2,.6,.3,.8,"C") Npt Np.Yrang Cprob 7.850290e+05 4.749670e+05 4.204734e-01 > CondProb(.2,.6,.3,.8,"T") Npt Np.Yrang Cprob 5.004190e+05 2.751520e+05 5.655165e-01 CumPoker.txt Tally = array(1e4) plot(0,0, xlab="No. Hands n", ylab="Rel.Freq", xlim=c(0,1e4), ylim=c(0, .08), type="n", main=paste( "Plot of Relative Frequency Versus n of","\n", "2-Pair Poker Hands")) TwPr = 0.04753902 abline(h=TwPr, lty=4) for(i in 1:100) { for(j in 1:100) { Xvec = sample(52, 5) Xcard = 1 + (Xvec-1) %% 13 Tally[(i-1)*100 +j] = sum(table(Xcard)==2)==2 } points(i*100, mean(Tally[1:(i*100)]), pch=5) } ---------------------------------------------------------- HistPlot.txt plot(seq(0,1,.01),dbeta(seq(0,1,.01),2,3), xlim=c(0,1), type="n", xlab="X value", ylab="Density", main=paste( "Plot of Single-Cell Histogram Bar & Density Seqment", "\n","over the interval (0.4, 0.6]")) abline(v=.4, lty=5) abline(v=.6, lty=5) lines(seq(.4,.6,.01), dbeta(seq(.4,.6,.01), 2, 3)) text(rep(0.18,4),c(1.2, 1.0, 0.8,0.7), labels=c( "Curve = density", "Solid horiz.= avg height", "Dashed horiz.= Scaled", "Hist. for rel freq.= .24")) lines(c(.4,.6),c(1.2,1.2), lty=4) lines(c(.4,.6),rep((pbeta(.6,2,3)-pbeta(.4,2,3))/.2,2), lty=1) ## Histogram Demo: HistDemo = function(N, L=40, type="n", parm=c(3,2)) { ### type = "n" for "norm", "b" for "beta", "g" for "gamma" if(type=="n") { dfcn = function(x) dnorm(x,parm[1],parm[2]) rfcn = function(x) rnorm(x,parm[1],parm[2]) labl = paste("Normal(",parm[1],",",parm[2]^2,")",sep="") } else { if(type=="b") { dfcn = function(x) dbeta(x,parm[1],parm[2]) rfcn = function(x) rbeta(x,parm[1],parm[2]) labl = paste("Beta(",parm[1],",",parm[2]^2,")",sep="") } else { dfcn = function(x) dgamma(x,parm[1],parm[2]) rfcn = function(x) rgamma(x,parm[1],parm[2]) labl = paste("Gamma(",parm[1],",",parm[2]^2,")",sep="") } } data=sort(rfcn(N)) hist(data, nclass=L, prob=T, main=paste( "Scaled Histogram & Density for ",labl," Data","\n", "with N=",N," points and ",L," class intervals",sep="")) xpts = seq(data[1],data[N],length=150) lines(xpts, dfcn(xpts), lty=4) } Example runs: > HistDemo(1e4, type="b") > HistDemo(1e3, type="g") > HistDemo(1e5, L=100) ------------------------------------------------- Two Sample Permutational Test based on Box, Hunter, Hunter shoe-wear data (2gps, 10 each) > lapply(shoes, function(row) c(mean=mean(row),std.dev=sd(row))) $A mean std.dev 10.630000 2.451326 $B mean std.dev 11.040000 2.518465 ### consider permutations of 20 > combShoe = unlist(shoes) ### there are 184756 assignments of the twenty into two groups ### but we can sample Monte Carlo to learn hist of GpB minus GpA ### means: for the orginal groups, difference is 0.41 > sum(combShoe) [1] 216.7 ## so for each sampled group of 10 (regarded as B) ## can take mean(newB) - 0.1*(216.7-10*mean(newB) = 2*mean(newB)-21.67 > mean(abs(MeanDiff) > 0.41) [1] 0.7894 PermShoe.txt MeanDiff = numeric(2e4) for(i in 1:2e4) MeanDiff[i] = 2*mean(sample(combShoe, 10, rep=T))-21.67 > hist(MeanDiff, nclass=25, xlab="Mean Diff", prob=T, main=paste( "Histogram of Permutational Mean Gp Differences","\n", "Proportion > .41 =", mean(abs(MeanDiff) > 0.41))) abline(v=0.41) NonParametric Bootstrap Standard Dev of IQR for moderate sample of multimodal "galaxies" data: > sum(c(-1,1)*sort(galaxies)[c(21,61)]) [1] 3385 ### Now want variability IQRvec = numeric(0) for(i in 1:1e4) IQRvec[i] = sum(c(-1,1)*sort(sample(galaxies,82, rep=T))[c(21,61)]) > sd(IQRvec) [1] 405.8119 > hist(galaxies, nclass=16, main=paste( "Histogram of 82 Galaxies, IQR = 3385","\n", "Std. Dev. of 1e4 IQR's from Sample With Rep = 405.81"))