STAT770    12/7/2020         RANDOM FORESTS ILLUSTRATION
   taken from the web-page by Cory Macklin:
   https://towardsdatascience.com/random-forest-in-r-f66adf80ec9

> library(randomForest)
> hdata = read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data", header=F)
  names(hdata) = c("age", "sex", "cp", "trestbps", "choi", "fbs", "restecg", "thalach",
                   "exang", "oldpeak", "slope", "ca", "thai", "num")
  ## ("choi" and "thai" correspond to "chol" and "thal" in the UCI documentation)
  hdata$num[hdata$num > 1] = 1     ### indicator of heart disease, the variable of interest

## Change missing data to "normal" for thai; remove rows with missing ca
##    thai: (3 = normal; 6 = fixed defect; 7 = reversible defect)
##    ca:   number of major vessels (0-3) colored by fluoroscopy
  hdata = hdata[hdata$ca != "?",]
  hdata$thai[hdata$thai=="?"] = "3.0"
  hdata <- transform( hdata,
     age=as.integer(age), sex=as.factor(sex), cp=as.factor(cp),
     trestbps=as.integer(trestbps), choi=as.integer(choi), fbs=as.factor(fbs),
     restecg=as.factor(restecg), thalach=as.integer(thalach), exang=as.factor(exang),
     oldpeak=as.numeric(oldpeak), slope=as.factor(slope), thai=factor(thai),
     ca=factor(ca), num=as.factor(num) )      ### now 299 x 14

> library(caTools)
  sample = sample.split(hdata$num, SplitRatio = .75)
  train = subset(hdata, sample == TRUE)    ### 225 x 14
  test  = subset(hdata, sample == FALSE)   ###  74 x 14

> fitRF = randomForest(formula = num ~ ., data=train)
  fitRF
   ...
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 17.33%
Confusion matrix:
    0  1 class.error
0 106 15   0.1239669    ## true num=0
1  24 80   0.2307692    ## true num=1

> predH = predict(fitRF, newdata=test[,-14])
  table(truH = test[,14], pred=predH)
    pred
truH  0  1
   0 31  9
   1  6 28

> fitRF2 = randomForest(formula = num ~ ., data=train, ntree = 1200, sampsize = 150)
Call:
 randomForest(formula = num ~ ., data = train, ntree = 1200, sampsize = 150)
               Type of random forest: classification
                     Number of trees: 1200
No. of variables tried at each split: 3

        OOB estimate of  error rate: 17.78%
Confusion matrix:
    0  1 class.error
0 105 16   0.1322314
1  24 80   0.2307692

> table(truH = test[,14], pred=predict(fitRF, newdata=test[,-14]))
    pred
truH  0  1
   0 31  9
   1  6 28

> fitLR = glm(num ~ . , data=train, family=binomial)
> table(truH= train$num, pred= (fitLR$fitted >.5))
    pred
truH FALSE TRUE
   0   112    9
   1    16   88
> table(truH = test$num, pred = (predict(fitLR, newdata=test[,-14])>0.5))
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
  factor restecg has new levels 1
> table(train$restecg)
  0   1   2
113   0 112
> table(test$restecg)
 0  1  2
35  4 35
### This variable was not useful, so do it again without restecg

> fitLR = glm(num ~ . , data=train[,-7], family=binomial)
> table(truH= train$num, pred= (fitLR$fitted >.5))
    pred
truH FALSE TRUE
   0   112    9
   1    16   88
> table(truH = test$num, pred = (predict(fitLR, newdata=test[,-c(7,14)])>0.5))
    pred
truH FALSE TRUE
   0    36    4
   1     5   29
### So again in this example, it seems the logistic regression does as well as any
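## A short added sketch (not run in the original session): the two test-set tables above
## can be condensed into misclassification rates for a direct comparison of the random
## forest and the logistic regression.  Note that the logistic table above thresholds the
## linear predictor at 0.5, which corresponds to a fitted probability of about 0.62;
## the sketch below uses type="response" so that the 0.5 cutoff is on the probability scale.

  errRF = mean( predict(fitRF, newdata=test[,-14]) != test$num )
  errLR = mean( (predict(fitLR, newdata=test[,-c(7,14)], type="response") > 0.5)
                != (test$num == "1") )
  round(c(RF = errRF, logistic = errLR), 3)

## From the tables shown above, the corresponding rates are (9+6)/74 = 0.20 for the random
## forest and (4+5)/74 = 0.12 for the logistic fit (with its linear-predictor cutoff).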
> fitLR2 = update(fitLR, data=hdata[,-7])
> table(truH=hdata$num, pred=fitLR2$fitted > .5)
    pred
truH FALSE TRUE
   0   150   11
   1    20  118

> fitRF3 = randomForest(formula = num ~ ., data=hdata, ntree = 2000, sampsize = 200)
> fitRF3$confusion
    0   1 class.error
0 139  22   0.1366460
1  28 110   0.2028986
> fitRF4 = randomForest(formula = num ~ ., data=hdata, ntree = 3000, sampsize = 220)

> library(rpart)
> rpfit = rpart(num ~ ., data=hdata, method = "class", control=list(minbucket=20))
> table(true=hdata$num, rpfit$where)
true    3   5   6   9  10  11
   0  103  22   3  17   6  10
   1   14   7  17   7  14  79
> table(true=hdata$num, pred=rpfit$where %in% c(6,10,11))
    pred
true FALSE TRUE
   0   142   19
   1    28  110
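## A small added sketch (not in the original log): instead of hand-coding which terminal
## nodes of rpfit (the $where values 6, 10, 11) correspond to predicted heart disease,
## predict() on the rpart fit returns the majority class of each terminal node directly
## and should reproduce the last table above.

  rpPred = predict(rpfit, type="class")   # majority-class label of each case's terminal node
  table(true = hdata$num, pred = rpPred)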
### Brief discussion of Random Forests in slides in Lecture 27.

##------------------------------------------------------------------------------
## Other examples:
#  (Return to the Wisconsin Breast Cancer data set from the UCI repository,
#   as in RscriptLec16.RLog)

> WiscBrC = read.csv(paste0("http://archive.ics.uci.edu/ml/",
       "machine-learning-databases/breast-cancer-wisconsin",
       "/breast-cancer-wisconsin.data"), header=F)
#### All variables categorical, ordinal
#----------------------------------------------------------
   Group 1:  367 instances (January 1989)
   Group 2:   70 instances (October 1989)
   Group 3:   31 instances (February 1990)
   Group 4:   17 instances (April 1990)
   Group 5:   48 instances (August 1990)
   Group 6:   49 instances (Updated January 1991)
   Group 7:   31 instances (June 1991)
   Group 8:   86 instances (November 1991)
   -----------------------------------------
   Total:    699 points (as of the donated database on 15 July 1992)

    1. Sample code number            id number
    2. Clump Thickness               1 - 10
    3. Uniformity of Cell Size       1 - 10
    4. Uniformity of Cell Shape      1 - 10
    5. Marginal Adhesion             1 - 10
    6. Single Epithelial Cell Size   1 - 10
    7. Bare Nuclei                   1 - 10
    8. Bland Chromatin               1 - 10
    9. Normal Nucleoli               1 - 10
   10. Mitoses                       1 - 10
   11. Class:                        (2 for benign, 4 for malignant)
#---------------------------------------------------------------
> names(WiscBrC) = c("ID","Thick","UnifSiz","UnifShap","MargAdhes",
       "EpithSiz","BareNucl","BlChrom","Nucleoli","Mitoses","Class")
> table(WiscBrC$Mitoses)
  1   2   3   4   5   6   7   8  10
579  35  33  12   6   3   9   8  14
#### All variables categorical, ordinal
> table(WiscBrC$Class)
  2   4
458 241
> WiscBrC$Class = (WiscBrC$Class-2)/2
  gpsiz = c(367,70,31,17,48,49,31,86)
  WiscBrC = cbind.data.frame(WiscBrC, Group= factor(rep(1:8,gpsiz)),
        seqno = 1:699 - rep(cumsum(c(0,gpsiz[-8])),gpsiz))
## Goal is prediction of Malignancy from the other data
## Can also try to use seq no within group
> sapply(WiscBrC, class)
       ID     Thick   UnifSiz  UnifShap MargAdhes  EpithSiz  BareNucl   BlChrom  Nucleoli
"integer" "integer" "integer" "integer" "integer" "integer" "numeric" "integer" "integer"
  Mitoses     Class     Group     seqno
"integer" "numeric"  "factor" "numeric"
> sapply(WiscBrC, function(col) sum(is.na(col)))
     ### all 0 except BareNucl missing 16, initially ignored
> WiscBrC$BareNucl = as.numeric(as.character(WiscBrC$BareNucl))

> Mod0 = glm(Class ~ 1, data=WiscBrC[,-1], subset=!is.na(BareNucl), family=binomial)
> ModD = step(Mod0, scope=list(lower=formula(Class~1),
       upper= formula(Class ~ (Thick + UnifSiz + UnifShap + MargAdhes + EpithSiz +
              BareNucl + BlChrom + Nucleoli + Mitoses + Group + seqno)^2)),
       direction="both", k=4)
> table(true=WiscBrC$Class[!is.na(WiscBrC$BareNucl)], pred=ModD$fit > 0.5)
    pred
true FALSE TRUE
   0   433   11
   1     8  231
### very little misclassification from the selected logistic regression
> anova(ModD)
Analysis of Deviance Table
                  Df Deviance Resid. Df Resid. Dev
NULL                                682     884.35
UnifSiz            1   629.59       681     254.76
BareNucl           1    88.45       680     166.31
Thick              1    30.76       679     135.56
BlChrom            1    12.28       678     123.28
MargAdhes          1     8.68       677     114.60
UnifSiz:BareNucl   1     9.37       676     105.23
Thick:MargAdhes    1     6.98       675      98.25
UnifSiz:MargAdhes  1     4.11       674      94.14

##---------------------------- Now try decision trees --------------
> WBRtree = rpart(Class ~ ., data=WiscBrC, method = "class", control=list(minbucket=20))
n= 699
node), split, n, loss, yval, (yprob)
      * denotes terminal node
1) root 699 241 0 (0.65522175 0.34477825)
  2) UnifSiz< 2.5 429  12 0 (0.97202797 0.02797203) *
  3) UnifSiz>=2.5 270  41 1 (0.15185185 0.84814815)
    6) UnifShap< 2.5  23   5 0 (0.78260870 0.21739130) *
    7) UnifShap>=2.5 247  23 1 (0.09311741 0.90688259) *
> table(WiscBrC$Class, WBRtree$where)
      2   4   5
  0 417  18  23
  1  12   5 224
### very similar classification with just 3 terminal nodes!!
### note that the stepwise logistic model ModD did not use UnifShap, although the tree splits on it
> round(rbind(count=table(WiscBrC$UnifShap),
       frac.Malign=aggregate(WiscBrC$Class, by=list(WiscBrC$UnifShap), mean)$x),3)
                  1      2      3      4      5    6      7      8 9 10
count       353.000 59.000 56.000 44.000 34.000 30.0 30.000 28.000 7 58
frac.Malign   0.006  0.119  0.411  0.705  0.912  0.9  0.933  0.964 1  1

> table(WiscBrC$Class[is.na(WiscBrC$BareNucl)])
 0  1
14  2
### omit the BareNucl variable in the Random Forest fit
> WiscBrC$Class = factor(WiscBrC$Class)
> WBrRF1 = randomForest(formula = Class ~ ., data=WiscBrC[,-7], ntree = 5000, sampsize = 400)
> WBrRF1$confusion
    0   1 class.error
0 443  15  0.03275109
1   7 234  0.02904564
### again better than rpart but not logistic!

> WBrRF1$importance
          MeanDecreaseGini     ### Gini importance, averaged over the trees of the forest
ID                3.904000
Thick            14.255952
UnifSiz          42.394457
UnifShap         37.848168
MargAdhes        10.025784
EpithSiz         19.077996
BlChrom          25.700934
Nucleoli         18.380379
Mitoses           2.430750
Group             1.971491
seqno             4.103469

### SO IN THIS EXAMPLE, WE ALSO SEE THAT THE WAY THE STEPWISE LOGISTIC-REGRESSION
##  MODEL SELECTION WAS DONE, WE MISSED AN IMPORTANT INTERACTION BETWEEN UnifShap AND UnifSiz
##  THAT THE GRAPHICAL METHODS PICKED UP.  BUT IT WAS "IMPORTANT" ONLY IN THE ABSENCE
##  OF MANY OTHER VARIABLES, e.g. in the stripped-down decision tree found by rpart.

> ModF = update(ModD, formula = .~. + UnifShap*UnifSiz)
> c(logLik(ModF), logLik(ModD))
[1] -45.48548 -47.06909
## IN LIKELIHOOD OR LRT TERMS, THIS ADDITIONAL INTERACTION MAKES LITTLE DIFFERENCE!
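## To turn the logLik comparison above into a formal test, a sketch (not run in the original
## session): ModF adds the UnifShap main effect plus the UnifShap:UnifSiz interaction to ModD,
## i.e. 2 extra parameters, and the deviance drops by only 2*(47.069 - 45.485) = 3.17.

  anova(ModD, ModF, test="Chisq")         # likelihood-ratio (analysis-of-deviance) comparison
  2*(as.numeric(logLik(ModF)) - as.numeric(logLik(ModD)))   # = 3.167, the change in deviance
  pchisq(3.167, df=2, lower.tail=FALSE)   # approx 0.21, not a significant improvement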
##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
## Final Example from the UCI Repository -- Digital Character recognition
##   Preprocessed data from NIST 1998, 64 attributes per record plus class
##   (from 32x32 pixel images).  All input attributes are integers in the range 0..16.
##   The last attribute is the class code 0..9.

> digi.test = read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes",
           header=F)     ### 1797 x 65
  digi.trn = read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra",
           header=F)     ### 3823 x 65
> names(digi.test)[65] = names(digi.trn)[65] = "class"
> digi.test$class = factor(ifelse(digi.test$class==0,1,0))   ### consider only "0" vs "other"
  digi.trn$class  = factor(ifelse(digi.trn$class==0,1,0))
> maxlev = sapply(rbind.data.frame(digi.test,digi.trn)[,-65], max)
  for(i in 1:64) {
     digi.test[,i] = factor(digi.test[,i], levels=0:maxlev[i])
     digi.trn[,i]  = factor(digi.trn[,i],  levels=0:maxlev[i]) }

> library(rpart)
  DigFit0 = rpart(class ~ ., data=digi.trn, method = "class", control=list(minbucket=50))
  DigFit0
n= 3823
node), split, n, loss, yval, (yprob)
      * denotes terminal node
1) root 3823 376 0 (0.901647920 0.098352080)
  2) V37=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 3276  24 0 (0.992673993 0.007326007) *
  3) V37=0 547 195 1 (0.356489945 0.643510055)
    6) V34=0,1 191  23 0 (0.879581152 0.120418848) *
    7) V34=2,3,4,5,6,7,8,9,10,12 356  27 1 (0.075842697 0.924157303) *
> table(class=digi.trn$class, pred=ifelse( predict(DigFit0)[,2]>0.5,1,0))
     pred
class    0    1
    0 3420   27
    1   47  329
> table(class=digi.test$class, pred=ifelse( predict(DigFit0,digi.test[,-65])[,2]>0.5,1,0))
     pred
class    0    1
    0 1596   23
    1    8  170

> library(randomForest)
  DigFit1 = randomForest(formula = class ~ ., data=digi.trn, ntree = 5000,
                         sampsize = 1000, replace=F)
  DigFit1$confusion
     0   1 class.error
0 3447   0 0.00000000     ### no mistakes when the character is not "0" !!
1   15 361 0.03989362     ###  4% misclassification when the character is "0"
  table(true=digi.test$class, pred=ifelse( predict(DigFit1,digi.test[,-65])==1,1,0))
    pred
true    0    1
   0 1618    1
   1    7  171
## performance almost as good on the test set !!

  DigFit2 = randomForest(formula = class ~ ., data=digi.trn, ntree = 5000,
                         replace=F)      ### default sampsize = .632*3823
> DigFit2$confusion
     0   1  class.error
0 3446   1 0.0002901073
1   11 365 0.0292553191
  table(true=digi.test$class, pred=ifelse( predict(DigFit2,digi.test[,-65])==1,1,0))
    pred
true    0    1
   0 1619    0
   1    3  175
### this version seems definitely better !!
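## An added sketch (not in the original log) condensing the three test-set tables above into
## single misclassification rates; from those tables the values are 31/1797 = 0.017 for the
## rpart tree, 8/1797 = 0.0045 for DigFit1, and 3/1797 = 0.0017 for DigFit2.

  testErr = function(pred) mean(pred != digi.test$class)
  round(c(rpart = testErr(predict(DigFit0, digi.test[,-65], type="class")),
          RF1   = testErr(predict(DigFit1, digi.test[,-65])),
          RF2   = testErr(predict(DigFit2, digi.test[,-65]))), 4)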
> ind = order(DigFit2$importance)[64:55]
  cbind.data.frame(varbl=paste0("V",ind), MeanDecrGini=DigFit2$importance[ind])
   varbl MeanDecrGini
1    V37     72.49697
2    V29     46.49720
3    V31     34.95095
4    V36     31.15619
5    V34     23.75331
6    V39     21.30727
7    V43     20.67002
8    V28     19.92173
9    V22     14.80171
10   V42     12.99494
### much more information about useful variables than in the sketchy rpart tree

## The $forest output contains full information on all trees grown
> sapply(DigFit2$forest, class)
 ndbigtree nodestatus    bestvar    treemap   nodepred xbestsplit        pid     cutoff
 "integer"   "matrix"   "matrix"    "array"   "matrix"   "matrix"  "numeric"  "numeric"
      ncat     maxcat    nrnodes      ntree     nclass    xlevels
 "integer"  "integer"  "integer"  "numeric"  "integer"     "list"
> sapply(DigFit2$forest, length)
 ndbigtree nodestatus    bestvar    treemap   nodepred xbestsplit        pid     cutoff
      5000     605000     605000    1210000     605000     605000          2          2
      ncat     maxcat    nrnodes      ntree     nclass    xlevels
        64          1          1          1          1         64
> unlist(DigFit2$forest[10:13])
maxcat nrnodes   ntree  nclass
    17     121    5000       2
> dim(DigFit2$forest$nodestatus)
[1]  121 5000      ### 121 possible nodes, info kept for each of the 5000 trees
> table(c(DigFit2$forest$nodestatus))
    -1      0      1
176824 256352 171824      ### total 121*5000

### How to look at the saved trees?
> num2binary = function(intgr) {
     xpnd = NULL
     while(intgr) {
        dgt = intgr %% 2
        xpnd = c(dgt, xpnd)
        intgr = intgr %/% 2 }
     xpnd }
> tmptree = getTree(DigFit2, 5)
> class(tmptree)
[1] "matrix"
> dim(tmptree)
[1] 65  6      ### 65 nodes; the columns are
  [1] "left daughter"  "right daughter" "split var"      "split point"    "status"
  [6] "prediction"
> tmptree[1:10,]
   left daughter right daughter split var split point status prediction
1              2              3        30       16368      1          0
2              4              5        37           1      1          0
3              6              7        31         504      1          0
4              8              9        34       65532      1          0
5             10             11        35       12544      1          0
6             12             13        42        2036      1          0
7             14             15        47       10256      1          0
8             16             17        50      105983      1          0
9             18             19        59       94222      1          0
10            20             21        31      110064      1          0

## Split points for categorical split variables are integers to be interpreted as binary
## expansions, with 1's indicating the categories of the split variable that get sent to
## the left daughter (a small decoding helper is sketched below), e.g.
> levels(digi.trn$V30)
 [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16"
> num2binary(16368)
 [1] 1 1 1 1 1 1 1 1 1 1 0 0 0 0
## Think of this as a binary sequence with 17 places, one per level, whose bits count from
## right to left; so the categories "4"-"13" are sent to the left daughter, all others to the right.
> levels(digi.trn$V37)
 [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16"
## The split-point 1 for the second node means that category "0" gets sent left and all
## others get sent right, AND THIS IS THE SAME SPLIT ON V37 THAT APPEARED IN THE rpart TREE.
## Similarly the split-point 65532 on V34 IMPLIES THAT CATEGORIES "0","1" move to the right
## daughter and all others to the left, as in the rpart-tree split on V34.
> num2binary(65532)
 [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
> levels(digi.trn$V34)
 [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
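## A hypothetical helper (an added sketch, not in the original session) that automates the
## decoding just illustrated: given a split point and the factor levels of the split variable,
## it lists the categories sent to the left daughter, using the convention above
## (bit k, counting from the right, corresponds to level k).

  levelsLeft = function(splitpoint, levs) {
     bits = rev(num2binary(splitpoint))   # reverse so that bits[k] corresponds to levs[k]
     levs[which(bits == 1)] }
  levelsLeft(16368, levels(digi.trn$V30))   # should give "4" through "13"
  levelsLeft(65532, levels(digi.trn$V34))   # should give "2" through "15", i.e. "0","1" go right
  levelsLeft(1,     levels(digi.trn$V37))   # should give just "0"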
#---
## "status" values are 1, indicating that the node is split, or -1, indicating a terminal node.
## The same status values appear in the DigFit2$forest$nodestatus matrix, where node status 0
## indicates that the node does not exist in that tree.
> DigFit2$forest$nodestatus[,5]
  [1]  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1 -1  1  1  1 -1  1 -1  1 -1  1 -1  1
 [29] -1  1  1 -1 -1  1 -1 -1 -1 -1 -1 -1  1 -1  1  1 -1  1 -1  1 -1 -1 -1 -1 -1  1 -1 -1
 [57] -1 -1  1 -1 -1 -1 -1 -1 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
 [85]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
[113]  0  0  0  0  0  0  0  0  0
> summary(c(DigFit2$forest$xbestsplit))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
      0       0       0   18084      99  131070
### The xbestsplit numbers are the "split point" numbers telling which categories of the
##  current split variable move to the left and right daughters
> DigFit2$forest$xbestsplit[1:10,5]
 [1]  16368      1    504  65532  12544   2036  10256 105983  94222 110064
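## One last added check (a sketch, not in the original log): these first ten xbestsplit values
## for tree 5 are the same numbers as the "split point" column of tmptree = getTree(DigFit2, 5)
## displayed earlier, which can be verified directly.

  all( DigFit2$forest$xbestsplit[1:10, 5] == tmptree[1:10, "split point"] )   # should be TRUE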