R Script for Wednesday January 26, 2022 on Data Display 
   and Preliminary Data Manipulations
=======================================================     1/26/22
    IN THIS SCRIPT, ALL R COMMANDS ARE FROM  base R

Datasets:

banknote

## there are a number of simulated multivariate datasets in the package "mclust", 
#   but the real datasets are:  
#    thyroid (215x6), Wisconsin diagnostic Breast Cancer (wdbc, 569x32)
#   and  banknote,  GvHD, acidity, EuroUnemployment

> table(thyroid$Diagnosis)
  Hypo Normal  Hyper 
    30    150     35 

> table(wdbc$Diagnosis)
  B   M                 ## Benign vs Malignant
357 212 

> dim(GvHD.control)    ## variable names  "CD4"  "CD8b" "CD3"  "CD8" 
[1] 6809    4
> dim(GvHD.pos)
[1] 9083    4


> dim(banknote)
[1] 200   7
> names(banknote)
[1] "Status"   "Length"   "Left"     "Right"    "Bottom"   "Top"      "Diagonal"
> table(banknote$Status)

counterfeit     genuine 
        100         100 

#=================================================================
# Here are data displays and manipulations on the  banknote  data,  
#     taking some ideas for displays from 
# W. Haerdle and L. Simar, Applied Multivariate Statistical Analysis, 2003 Springer.
#-----------------------------------------------------------------

> library(mclust)     ## loads the package, but not into the workspace
> data(banknote)      ## loads the dataset into the workspace
> dim(banknote)
[1] 200   7 
> names(banknote)
[1] "Status"   "Length"   "Left"     "Right"    "Bottom"   "Top"      "Diagonal"
> table(banknote$Status)

counterfeit     genuine 
        100         100 

## BOXPLOTS

> boxplot(banknote$Diagonal[banknote$Status=="counterfeit"],
          banknote$Diagonal[banknote$Status=="genuine"])
   ### shows clearly larger diagonal for "genuine" -- explain boxplot

> aggregate(banknote$Diagonal, by=banknote$Status, summary)
      Group.1  x.Min. x.1st Qu. x.Median  x.Mean x.3rd Qu.  x.Max.
1 counterfeit 137.800   139.200  139.500 139.450   139.800 140.600
2     genuine 139.600   141.200  141.500 141.517   141.800 142.400

> boxplot(banknote$Right[banknote$Status=="counterfeit"],
          banknote$Right[banknote$Status=="genuine"])
   ## so mostly the Right variable distinguishes counterfet from genuine, but not perfectly

## HISTOGRAMS

## Can use this to set up histogram picture and sub-plot histogram by type
## Note: to use histograms overlaid, relative-frequency histograms must be re-scaled

> hist(banknote$Right, prob=T, xlab="Right, mm", main=
   "Histogram for Right Measurement on Banknotes")

> hist(banknote$Right, prob=T, xlab="Right, mm", main=
    "Histogram for Right Measurement on Banknotes", ylim=c(0,2), 
    density=0, border=NA)
  hist(banknote$Right[banknote$Status=="counterfeit"], prob=T, col="cyan", 
       density=15, add=T)
  hist(banknote$Right[banknote$Status=="genuine"], prob=T, col="orange", 
       density=15, angle = 135, add=T)


## LOTS OF VARIABLES AT ONCE

> pairs(banknote[,-1])

> cor(banknote[,-1])

> round(cor(banknote[,-1]), 3)
              Length       Left      Right     Bottom         Top   Diagonal
         Length   Left  Right Bottom    Top Diagonal
Length    1.000  0.231  0.152 -0.190 -0.061    0.194
Left      0.231  1.000  0.743  0.414  0.362   -0.503
Right     0.152  0.743  1.000  0.487  0.401   -0.516
Bottom   -0.190  0.414  0.487  1.000  0.142   -0.623
Top      -0.061  0.362  0.401  0.142  1.000   -0.594
Diagonal  0.194 -0.503 -0.516 -0.623 -0.594    1.000


> pairs(banknote[banknote$Status=="genuine",-1], main="GENUINE")
  dev.new()
  pairs(banknote[banknote$Status=="counterfeit",-1], main="COUNTERFEIT")

  dev.off();dev.off()

> round(cor(banknote[banknote$Status=="counterfeit",-1]), 3)
         Length   Left  Right Bottom    Top Diagonal
Length    1.000  0.351  0.229 -0.252  0.087    0.059
Left      0.351  1.000  0.615 -0.083 -0.073   -0.035
Right     0.229  0.615  1.000 -0.055  0.001    0.206
Bottom   -0.252 -0.083 -0.055  1.000 -0.681    0.378
Top       0.087 -0.073  0.001 -0.681  1.000   -0.062
Diagonal  0.059 -0.035  0.206  0.378 -0.062    1.000

> round(cor(banknote[banknote$Status=="genuine",-1]), 3)
         Length   Left  Right Bottom    Top Diagonal
Length    1.000  0.411  0.416  0.229  0.057    0.032
Left      0.411  1.000  0.664  0.242  0.208   -0.265
Right     0.416  0.664  1.000  0.255  0.133   -0.150
Bottom    0.229  0.242  0.255  1.000 -0.632   -0.001
Top       0.057  0.208  0.133 -0.632  1.000   -0.260
Diagonal  0.032 -0.265 -0.150 -0.001 -0.260    1.000


## TO DISPLAY MANY PLOTS OF VARIABLES FROM ONE GROUP AGAINST ANOTHER:

> par(mfrow=c(2,3))
  for(j in 2:7) 
       plot(banknote[banknote$Status=="counterfeit",j],
            banknote[banknote$Status=="genuine",j],
            xlab="Counterfeit", ylab="Genuine",
            main=paste0("Variable: ",names(banknote)[j]))

## You have to examine the ranges n the axes carefully to get information from the plot.
## To be more useful, the range should be the same for both variables

> for(j in 2:7) 
       plot(banknote[banknote$Status=="counterfeit",j],
           banknote[banknote$Status=="genuine",j],
           xlab="Counterfeit", ylab="Genuine",
           xlim=range(banknote[,j]), ylim=range(banknote[,j]),              
           main=paste0("Variable: ",names(banknote)[j]))

## I still like the boxplots better for visual effect

> for(j in 2:7) boxplot(banknote[banknote$Status=="counterfeit",j],
           banknote[banknote$Status=="genuine",j],
           xlab=names(banknote)[j], main = paste0(names(banknote)[j]," BoxPlot"))

## Histograms are OK too if there are not too many of them:

> for(j in 2:7) {
  hist(banknote[,j], nclass=10, xlab="Right, mm", main= paste0(
    "Histogram for ",names(banknote)[j]),  density=0, border=NA)
  hist(banknote[banknote$Status=="counterfeit",j], nclass=10, col="cyan", 
       density=15, add=T)
  hist(banknote[banknote$Status=="genuine",j], nclass=10, col="orange", 
       density=15, angle = 135, add=T) }

## NB here we abandoned prob=T: otherwise it is a little difficult to get 
#     the vertical scaling right for all 6 variables.

> par(mfrow=c(1,1))              ### to restore plot-window to plot single graph


### Projections, in fixed or random pairs of directions:

  tmpmat = data.matrix(banknote[,-1]) %*% cbind( runitvec(6), runitvec(6))
  plot(tmpmat, xlab="random direction x", ylab="random direction y", type="n")
  points(tmpmat[banknote$Status=="genuine",], pch=5)
  points(tmpmat[banknote$Status=="counterfeit",], pch=18)

### Generally (not always) gives a very clear sense that the two clouds of points
#     can be separated perfectly by hyperplanes !!

  runitvec = function(ndim, seed=NULL)  {
       if(!is.null(seed)) set.seed(seed)
       x = rnorm(ndim)
       x/sqrt(sum(x*x))   }

  Ran.Proj = function(dat, set1=1:(nrow(dat)%/%2), seed=NULL) {
       if(!is.null(seed)) set.seed(seed)
       if(is.data.frame(dat)) dat = data.matrix(dat)
       k = ncol(dat)
       Pmat = cbind( runitvec(k), runitvec(k))
       tmpmat = dat %*% Pmat
       plot(tmpmat, xlab="random direction x", 
            ylab="random direction y", type="n")
       points(tmpmat[set1,], pch=5)
       points(tmpmat[setdiff(1:nrow(dat),set1),], pch=18)
       Pmat }

 tmp= Ran.Proj(banknote[,-1],1:100)

## do this several times, mostly we perfectly separate the two classes of currency!

  tmpmat = data.matrix(banknote[,c(4,6)])
  plot(tmpmat, xlab=names(banknote)[4], ylab=names(banknote)[6], type="n")
  points(tmpmat[banknote$Status=="genuine",], pch=5)
  points(tmpmat[banknote$Status=="counterfeit",], pch=18)

  tmpmat = data.matrix(banknote[,c(5,7)])
  plot(tmpmat, xlab=names(banknote)[5], ylab=names(banknote)[7], type="n")
  points(tmpmat[banknote$Status=="genuine",], pch=5)
  points(tmpmat[banknote$Status=="counterfeit",], pch=18)

## The random projections above were done without centering and scaling:

> apply(banknote[,-1],2,summary)
         Length     Left    Right  Bottom     Top Diagonal
Min.    213.800 129.0000 129.0000  7.2000  7.7000 137.8000
1st Qu. 214.600 129.9000 129.7000  8.2000 10.1000 139.5000
Median  214.900 130.2000 130.0000  9.1000 10.6000 140.4500
Mean    214.896 130.1215 129.9565  9.4175 10.6505 140.4835
3rd Qu. 215.100 130.4000 130.2250 10.6000 11.2000 141.5000
Max.    216.300 131.0000 131.1000 12.7000 12.3000 142.4000

## But that should not make too much difference in understanding separability
#    of the two point clouds by hyperplanes. (Why not ?)

# Check this by applying the random projections after first centering

> bankaux = data.matrix(banknote[,-1]) - rep(1,200) %o% apply(banknote[,-1],2,mean)

## and we could scale too if we wanted ... makes a little difference, not much.

##########################################################################

# Many other displays are possible by first replacing the data in each variable 
#    by ranks within the complete dataset:

> bankrank = apply(data.matrix(banknote[,-1]), 2, rank)

> bankrank[(1:10)*10,]
      Length  Left Right Bottom   Top Diagonal
 [1,]  162.5 149.5 160.0  103.0  41.5    103.5
 [2,]   60.5 110.0  84.5   73.5  41.5    181.0
 [3,]   81.0  31.0  11.5   60.0   4.5    187.5
 [4,]    2.5 129.5   1.0   47.5  23.0    129.0
 [5,]   26.5   1.0  41.0   24.0  29.0    187.5
 [6,]  145.0  77.0  71.5   53.5  68.0    136.0
 [7,]  104.0 110.0 141.5   41.0 152.5     57.0
 [8,]   43.5  59.0  22.0   31.0  41.5    173.0
 [9,]  124.5  59.0  56.0   41.0  83.5    187.5
[10,]   60.5  77.0  22.0   24.0  41.5    123.5

>  par(mfrow=c(2,3))
   for(j in 1:6) 
       plot(bankrank[banknote$Status=="counterfeit",j],
           bankrank[banknote$Status=="genuine",j],
           xlab="Counterfeit", ylab="Genuine",
           xlim=c(1,200), ylim=c(1,200),              
           main=paste0("Variable: ",names(banknote)[j]))

## Compare: "Pearson" correlations based on direct observations with 
#    those based on ranks ("Spearman method")

> round(cor(banknote[banknote$Status=="counterfeit",-1], method="spearman"),3)
         Length   Left  Right Bottom    Top Diagonal
Length    1.000  0.367  0.222 -0.254  0.133    0.153
Left      0.367  1.000  0.560 -0.086 -0.069    0.070
Right     0.222  0.560  1.000 -0.118 -0.001    0.262
Bottom   -0.254 -0.086 -0.118  1.000 -0.714    0.179
Top       0.133 -0.069 -0.001 -0.714  1.000   -0.045
Diagonal  0.153  0.070  0.262  0.179 -0.045    1.000
> round(cor(banknote[banknote$Status=="counterfeit",-1], method="pearson"),3)
         Length   Left  Right Bottom    Top Diagonal
Length    1.000  0.351  0.229 -0.252  0.087    0.059
Left      0.351  1.000  0.615 -0.083 -0.073   -0.035
Right     0.229  0.615  1.000 -0.055  0.001    0.206
Bottom   -0.252 -0.083 -0.055  1.000 -0.681    0.378
Top       0.087 -0.073  0.001 -0.681  1.000   -0.062
Diagonal  0.059 -0.035  0.206  0.378 -0.062    1.000