R Script for Wednesday January 26, 2022 on Data Display and Preliminary Data Manipulations ======================================================= 1/26/22 IN THIS SCRIPT, ALL R COMMANDS ARE FROM base R Datasets: banknote ## there are a number of simulated multivariate datasets in the package "mclust", # but the real datasets are: # thyroid (215x6), Wisconsin diagnostic Breast Cancer (wdbc, 569x32) # and banknote, GvHD, acidity, EuroUnemployment > table(thyroid$Diagnosis) Hypo Normal Hyper 30 150 35 > table(wdbc$Diagnosis) B M ## Benign vs Malignant 357 212 > dim(GvHD.control) ## variable names "CD4" "CD8b" "CD3" "CD8" [1] 6809 4 > dim(GvHD.pos) [1] 9083 4 > dim(banknote) [1] 200 7 > names(banknote) [1] "Status" "Length" "Left" "Right" "Bottom" "Top" "Diagonal" > table(banknote$Status) counterfeit genuine 100 100 #================================================================= # Here are data displays and manipulations on the banknote data, # taking some ideas for displays from # W. Haerdle and L. Simar, Applied Multivariate Statistical Analysis, 2003 Springer. #----------------------------------------------------------------- > library(mclust) ## loads the package, but not into the workspace > data(banknote) ## loads the dataset into the workspace > dim(banknote) [1] 200 7 > names(banknote) [1] "Status" "Length" "Left" "Right" "Bottom" "Top" "Diagonal" > table(banknote$Status) counterfeit genuine 100 100 ## BOXPLOTS > boxplot(banknote$Diagonal[banknote$Status=="counterfeit"], banknote$Diagonal[banknote$Status=="genuine"]) ### shows clearly larger diagonal for "genuine" -- explain boxplot > aggregate(banknote$Diagonal, by=banknote$Status, summary) Group.1 x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max. 1 counterfeit 137.800 139.200 139.500 139.450 139.800 140.600 2 genuine 139.600 141.200 141.500 141.517 141.800 142.400 > boxplot(banknote$Right[banknote$Status=="counterfeit"], banknote$Right[banknote$Status=="genuine"]) ## so mostly the Right variable distinguishes counterfet from genuine, but not perfectly ## HISTOGRAMS ## Can use this to set up histogram picture and sub-plot histogram by type ## Note: to use histograms overlaid, relative-frequency histograms must be re-scaled > hist(banknote$Right, prob=T, xlab="Right, mm", main= "Histogram for Right Measurement on Banknotes") > hist(banknote$Right, prob=T, xlab="Right, mm", main= "Histogram for Right Measurement on Banknotes", ylim=c(0,2), density=0, border=NA) hist(banknote$Right[banknote$Status=="counterfeit"], prob=T, col="cyan", density=15, add=T) hist(banknote$Right[banknote$Status=="genuine"], prob=T, col="orange", density=15, angle = 135, add=T) ## LOTS OF VARIABLES AT ONCE > pairs(banknote[,-1]) > cor(banknote[,-1]) > round(cor(banknote[,-1]), 3) Length Left Right Bottom Top Diagonal Length Left Right Bottom Top Diagonal Length 1.000 0.231 0.152 -0.190 -0.061 0.194 Left 0.231 1.000 0.743 0.414 0.362 -0.503 Right 0.152 0.743 1.000 0.487 0.401 -0.516 Bottom -0.190 0.414 0.487 1.000 0.142 -0.623 Top -0.061 0.362 0.401 0.142 1.000 -0.594 Diagonal 0.194 -0.503 -0.516 -0.623 -0.594 1.000 > pairs(banknote[banknote$Status=="genuine",-1], main="GENUINE") dev.new() pairs(banknote[banknote$Status=="counterfeit",-1], main="COUNTERFEIT") dev.off();dev.off() > round(cor(banknote[banknote$Status=="counterfeit",-1]), 3) Length Left Right Bottom Top Diagonal Length 1.000 0.351 0.229 -0.252 0.087 0.059 Left 0.351 1.000 0.615 -0.083 -0.073 -0.035 Right 0.229 0.615 1.000 -0.055 0.001 0.206 Bottom -0.252 -0.083 -0.055 1.000 -0.681 0.378 Top 0.087 -0.073 0.001 -0.681 1.000 -0.062 Diagonal 0.059 -0.035 0.206 0.378 -0.062 1.000 > round(cor(banknote[banknote$Status=="genuine",-1]), 3) Length Left Right Bottom Top Diagonal Length 1.000 0.411 0.416 0.229 0.057 0.032 Left 0.411 1.000 0.664 0.242 0.208 -0.265 Right 0.416 0.664 1.000 0.255 0.133 -0.150 Bottom 0.229 0.242 0.255 1.000 -0.632 -0.001 Top 0.057 0.208 0.133 -0.632 1.000 -0.260 Diagonal 0.032 -0.265 -0.150 -0.001 -0.260 1.000 ## TO DISPLAY MANY PLOTS OF VARIABLES FROM ONE GROUP AGAINST ANOTHER: > par(mfrow=c(2,3)) for(j in 2:7) plot(banknote[banknote$Status=="counterfeit",j], banknote[banknote$Status=="genuine",j], xlab="Counterfeit", ylab="Genuine", main=paste0("Variable: ",names(banknote)[j])) ## You have to examine the ranges n the axes carefully to get information from the plot. ## To be more useful, the range should be the same for both variables > for(j in 2:7) plot(banknote[banknote$Status=="counterfeit",j], banknote[banknote$Status=="genuine",j], xlab="Counterfeit", ylab="Genuine", xlim=range(banknote[,j]), ylim=range(banknote[,j]), main=paste0("Variable: ",names(banknote)[j])) ## I still like the boxplots better for visual effect > for(j in 2:7) boxplot(banknote[banknote$Status=="counterfeit",j], banknote[banknote$Status=="genuine",j], xlab=names(banknote)[j], main = paste0(names(banknote)[j]," BoxPlot")) ## Histograms are OK too if there are not too many of them: > for(j in 2:7) { hist(banknote[,j], nclass=10, xlab="Right, mm", main= paste0( "Histogram for ",names(banknote)[j]), density=0, border=NA) hist(banknote[banknote$Status=="counterfeit",j], nclass=10, col="cyan", density=15, add=T) hist(banknote[banknote$Status=="genuine",j], nclass=10, col="orange", density=15, angle = 135, add=T) } ## NB here we abandoned prob=T: otherwise it is a little difficult to get # the vertical scaling right for all 6 variables. > par(mfrow=c(1,1)) ### to restore plot-window to plot single graph ### Projections, in fixed or random pairs of directions: tmpmat = data.matrix(banknote[,-1]) %*% cbind( runitvec(6), runitvec(6)) plot(tmpmat, xlab="random direction x", ylab="random direction y", type="n") points(tmpmat[banknote$Status=="genuine",], pch=5) points(tmpmat[banknote$Status=="counterfeit",], pch=18) ### Generally (not always) gives a very clear sense that the two clouds of points # can be separated perfectly by hyperplanes !! runitvec = function(ndim, seed=NULL) { if(!is.null(seed)) set.seed(seed) x = rnorm(ndim) x/sqrt(sum(x*x)) } Ran.Proj = function(dat, set1=1:(nrow(dat)%/%2), seed=NULL) { if(!is.null(seed)) set.seed(seed) if(is.data.frame(dat)) dat = data.matrix(dat) k = ncol(dat) Pmat = cbind( runitvec(k), runitvec(k)) tmpmat = dat %*% Pmat plot(tmpmat, xlab="random direction x", ylab="random direction y", type="n") points(tmpmat[set1,], pch=5) points(tmpmat[setdiff(1:nrow(dat),set1),], pch=18) Pmat } tmp= Ran.Proj(banknote[,-1],1:100) ## do this several times, mostly we perfectly separate the two classes of currency! tmpmat = data.matrix(banknote[,c(4,6)]) plot(tmpmat, xlab=names(banknote)[4], ylab=names(banknote)[6], type="n") points(tmpmat[banknote$Status=="genuine",], pch=5) points(tmpmat[banknote$Status=="counterfeit",], pch=18) tmpmat = data.matrix(banknote[,c(5,7)]) plot(tmpmat, xlab=names(banknote)[5], ylab=names(banknote)[7], type="n") points(tmpmat[banknote$Status=="genuine",], pch=5) points(tmpmat[banknote$Status=="counterfeit",], pch=18) ## The random projections above were done without centering and scaling: > apply(banknote[,-1],2,summary) Length Left Right Bottom Top Diagonal Min. 213.800 129.0000 129.0000 7.2000 7.7000 137.8000 1st Qu. 214.600 129.9000 129.7000 8.2000 10.1000 139.5000 Median 214.900 130.2000 130.0000 9.1000 10.6000 140.4500 Mean 214.896 130.1215 129.9565 9.4175 10.6505 140.4835 3rd Qu. 215.100 130.4000 130.2250 10.6000 11.2000 141.5000 Max. 216.300 131.0000 131.1000 12.7000 12.3000 142.4000 ## But that should not make too much difference in understanding separability # of the two point clouds by hyperplanes. (Why not ?) # Check this by applying the random projections after first centering > bankaux = data.matrix(banknote[,-1]) - rep(1,200) %o% apply(banknote[,-1],2,mean) ## and we could scale too if we wanted ... makes a little difference, not much. ########################################################################## # Many other displays are possible by first replacing the data in each variable # by ranks within the complete dataset: > bankrank = apply(data.matrix(banknote[,-1]), 2, rank) > bankrank[(1:10)*10,] Length Left Right Bottom Top Diagonal [1,] 162.5 149.5 160.0 103.0 41.5 103.5 [2,] 60.5 110.0 84.5 73.5 41.5 181.0 [3,] 81.0 31.0 11.5 60.0 4.5 187.5 [4,] 2.5 129.5 1.0 47.5 23.0 129.0 [5,] 26.5 1.0 41.0 24.0 29.0 187.5 [6,] 145.0 77.0 71.5 53.5 68.0 136.0 [7,] 104.0 110.0 141.5 41.0 152.5 57.0 [8,] 43.5 59.0 22.0 31.0 41.5 173.0 [9,] 124.5 59.0 56.0 41.0 83.5 187.5 [10,] 60.5 77.0 22.0 24.0 41.5 123.5 > par(mfrow=c(2,3)) for(j in 1:6) plot(bankrank[banknote$Status=="counterfeit",j], bankrank[banknote$Status=="genuine",j], xlab="Counterfeit", ylab="Genuine", xlim=c(1,200), ylim=c(1,200), main=paste0("Variable: ",names(banknote)[j])) ## Compare: "Pearson" correlations based on direct observations with # those based on ranks ("Spearman method") > round(cor(banknote[banknote$Status=="counterfeit",-1], method="spearman"),3) Length Left Right Bottom Top Diagonal Length 1.000 0.367 0.222 -0.254 0.133 0.153 Left 0.367 1.000 0.560 -0.086 -0.069 0.070 Right 0.222 0.560 1.000 -0.118 -0.001 0.262 Bottom -0.254 -0.086 -0.118 1.000 -0.714 0.179 Top 0.133 -0.069 -0.001 -0.714 1.000 -0.045 Diagonal 0.153 0.070 0.262 0.179 -0.045 1.000 > round(cor(banknote[banknote$Status=="counterfeit",-1], method="pearson"),3) Length Left Right Bottom Top Diagonal Length 1.000 0.351 0.229 -0.252 0.087 0.059 Left 0.351 1.000 0.615 -0.083 -0.073 -0.035 Right 0.229 0.615 1.000 -0.055 0.001 0.206 Bottom -0.252 -0.083 -0.055 1.000 -0.681 0.378 Top 0.087 -0.073 0.001 -0.681 1.000 -0.062 Diagonal 0.059 -0.035 0.206 0.378 -0.062 1.000