STAT770    12/7/2020         RANDOM FORESTS ILLUSTRATION
   taken from the web-page by Cory Macklin:
   https://towardsdatascience.com/random-forest-in-r-f66adf80ec9

> library(randomForest)
> hdata = read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data", header=F)
  names(hdata) = c("age", "sex", "cp", "trestbps", "choi", "fbs", "restecg", "thalach",
                   "exang", "oldpeak", "slope", "ca", "thai", "num")
  ## ("choi" and "thai" correspond to "chol" and "thal" in the UCI documentation)
  hdata$num[hdata$num > 1] = 1     ### indicator of heart disease, the variable of interest

## Change missing data to "normal" for thai; remove rows with missing ca
##    thai: (3 = normal; 6 = fixed defect; 7 = reversible defect)
##    ca:   number of major vessels (0-3) colored by fluoroscopy
  hdata = hdata[hdata$ca != "?",]
  hdata$thai[hdata$thai=="?"] = "3.0"
  hdata <- transform( hdata,
     age=as.integer(age), sex=as.factor(sex), cp=as.factor(cp),
     trestbps=as.integer(trestbps), choi=as.integer(choi), fbs=as.factor(fbs),
     restecg=as.factor(restecg), thalach=as.integer(thalach), exang=as.factor(exang),
     oldpeak=as.numeric(oldpeak), slope=as.factor(slope), thai=factor(thai),
     ca=factor(ca), num=as.factor(num) )      ### now 299 x 14

> library(caTools)
  sample = sample.split(hdata$num, SplitRatio = .75)
  train = subset(hdata, sample == TRUE)    ### 225 x 14
  test  = subset(hdata, sample == FALSE)   ###  74 x 14

> fitRF = randomForest(formula = num ~ ., data=train)
  fitRF
   ...
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 17.33%
Confusion matrix:
    0  1 class.error
0 106 15   0.1239669    ## true num=0
1  24 80   0.2307692    ## true num=1

> predH = predict(fitRF, newdata=test[,-14])
  table(truH = test[,14], pred=predH)
    pred
truH  0  1
   0 31  9
   1  6 28

> fitRF2 = randomForest(formula = num ~ ., data=train, ntree = 1200, sampsize = 150)
Call:
 randomForest(formula = num ~ ., data = train, ntree = 1200, sampsize = 150)
               Type of random forest: classification
                     Number of trees: 1200
No. of variables tried at each split: 3

        OOB estimate of  error rate: 17.78%
Confusion matrix:
    0  1 class.error
0 105 16   0.1322314
1  24 80   0.2307692

> table(truH = test[,14], pred=predict(fitRF, newdata=test[,-14]))
    pred
truH  0  1
   0 31  9
   1  6 28

> fitLR = glm(num ~ . , data=train, family=binomial)
> table(truH= train$num, pred= (fitLR$fitted >.5))
    pred
truH FALSE TRUE
   0   112    9
   1    16   88
> table(truH = test$num, pred = (predict(fitLR, newdata=test[,-14])>0.5))
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
  factor restecg has new levels 1
> table(train$restecg)
  0   1   2
113   0 112
> table(test$restecg)
 0  1  2
35  4 35
### This variable was not useful, so do it again without restecg

> fitLR = glm(num ~ . , data=train[,-7], family=binomial)
> table(truH= train$num, pred= (fitLR$fitted >.5))
    pred
truH FALSE TRUE
   0   112    9
   1    16   88
> table(truH = test$num, pred = (predict(fitLR, newdata=test[,-c(7,14)])>0.5))
    pred
truH FALSE TRUE
   0    36    4
   1     5   29
### So again in this example, it seems the logistic regression does as well as any
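## A short added sketch (not run in the original session): the two test-set tables above
## can be condensed into misclassification rates for a direct comparison of the random
## forest and the logistic regression.  Note that the logistic table above thresholds the
## linear predictor at 0.5, which corresponds to a fitted probability of about 0.62;
## the sketch below uses type="response" so that the 0.5 cutoff is on the probability scale.

  errRF = mean( predict(fitRF, newdata=test[,-14]) != test$num )
  errLR = mean( (predict(fitLR, newdata=test[,-c(7,14)], type="response") > 0.5)
                != (test$num == "1") )
  round(c(RF = errRF, logistic = errLR), 3)

## From the tables shown above, the corresponding rates are (9+6)/74 = 0.20 for the random
## forest and (4+5)/74 = 0.12 for the logistic fit (with its linear-predictor cutoff).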
> fitLR2 = update(fitLR, data=hdata[,-7])
> table(truH=hdata$num, pred=fitLR2$fitted > .5)
    pred
truH FALSE TRUE
   0   150   11
   1    20  118

> fitRF3 = randomForest(formula = num ~ ., data=hdata, ntree = 2000, sampsize = 200)
> fitRF3$confusion
    0   1 class.error
0 139  22   0.1366460
1  28 110   0.2028986
> fitRF4 = randomForest(formula = num ~ ., data=hdata, ntree = 3000, sampsize = 220)

> library(rpart)
> rpfit = rpart(num ~ ., data=hdata, method = "class", control=list(minbucket=20))
> table(true=hdata$num, rpfit$where)
true    3   5   6   9  10  11
   0  103  22   3  17   6  10
   1   14   7  17   7  14  79
> table(true=hdata$num, pred=rpfit$where %in% c(6,10,11))
    pred
true FALSE TRUE
   0   142   19
   1    28  110
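## A small added sketch (not in the original log): instead of hand-coding which terminal
## nodes of rpfit (the $where values 6, 10, 11) correspond to predicted heart disease,
## predict() on the rpart fit returns the majority class of each terminal node directly
## and should reproduce the last table above.

  rpPred = predict(rpfit, type="class")   # majority-class label of each case's terminal node
  table(true = hdata$num, pred = rpPred)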
### Brief discussion of Random Forests in slides in Lecture 27.

##------------------------------------------------------------------------------
## Other examples:
#  (Return to the Wisconsin Breast Cancer data set from the UCI repository,
#   as in RscriptLec16.RLog)

> WiscBrC = read.csv(paste0("http://archive.ics.uci.edu/ml/",
       "machine-learning-databases/breast-cancer-wisconsin",
       "/breast-cancer-wisconsin.data"), header=F)
#### All variables categorical, ordinal
#----------------------------------------------------------
   Group 1:  367 instances (January 1989)
   Group 2:   70 instances (October 1989)
   Group 3:   31 instances (February 1990)
   Group 4:   17 instances (April 1990)
   Group 5:   48 instances (August 1990)
   Group 6:   49 instances (Updated January 1991)
   Group 7:   31 instances (June 1991)
   Group 8:   86 instances (November 1991)
   -----------------------------------------
   Total:    699 points (as of the donated database on 15 July 1992)

    1. Sample code number            id number
    2. Clump Thickness               1 - 10
    3. Uniformity of Cell Size       1 - 10
    4. Uniformity of Cell Shape      1 - 10
    5. Marginal Adhesion             1 - 10
    6. Single Epithelial Cell Size   1 - 10
    7. Bare Nuclei                   1 - 10
    8. Bland Chromatin               1 - 10
    9. Normal Nucleoli               1 - 10
   10. Mitoses                       1 - 10
   11. Class:                        (2 for benign, 4 for malignant)
#---------------------------------------------------------------
> names(WiscBrC) = c("ID","Thick","UnifSiz","UnifShap","MargAdhes",
       "EpithSiz","BareNucl","BlChrom","Nucleoli","Mitoses","Class")
> table(WiscBrC$Mitoses)
  1   2   3   4   5   6   7   8  10
579  35  33  12   6   3   9   8  14
#### All variables categorical, ordinal
> table(WiscBrC$Class)
  2   4
458 241
> WiscBrC$Class = (WiscBrC$Class-2)/2
  gpsiz = c(367,70,31,17,48,49,31,86)
  WiscBrC = cbind.data.frame(WiscBrC, Group= factor(rep(1:8,gpsiz)),
        seqno = 1:699 - rep(cumsum(c(0,gpsiz[-8])),gpsiz))
## Goal is prediction of Malignancy from the other data
## Can also try to use seq no within group
> sapply(WiscBrC, class)
       ID     Thick   UnifSiz  UnifShap MargAdhes  EpithSiz  BareNucl   BlChrom  Nucleoli
"integer" "integer" "integer" "integer" "integer" "integer" "numeric" "integer" "integer"
  Mitoses     Class     Group     seqno
"integer" "numeric"  "factor" "numeric"
> sapply(WiscBrC, function(col) sum(is.na(col)))
     ### all 0 except BareNucl missing 16, initially ignored
> WiscBrC$BareNucl = as.numeric(as.character(WiscBrC$BareNucl))

> Mod0 = glm(Class ~ 1, data=WiscBrC[,-1], subset=!is.na(BareNucl), family=binomial)
> ModD = step(Mod0, scope=list(lower=formula(Class~1),
       upper= formula(Class ~ (Thick + UnifSiz + UnifShap + MargAdhes + EpithSiz +
              BareNucl + BlChrom + Nucleoli + Mitoses + Group + seqno)^2)),
       direction="both", k=4)
> table(true=WiscBrC$Class[!is.na(WiscBrC$BareNucl)], pred=ModD$fit > 0.5)
    pred
true FALSE TRUE
   0   433   11
   1     8  231
### very little misclassification from the selected logistic regression
> anova(ModD)
Analysis of Deviance Table
                  Df Deviance Resid. Df Resid. Dev
NULL                                682     884.35
UnifSiz            1   629.59       681     254.76
BareNucl           1    88.45       680     166.31
Thick              1    30.76       679     135.56
BlChrom            1    12.28       678     123.28
MargAdhes          1     8.68       677     114.60
UnifSiz:BareNucl   1     9.37       676     105.23
Thick:MargAdhes    1     6.98       675      98.25
UnifSiz:MargAdhes  1     4.11       674      94.14

##---------------------------- Now try decision trees --------------
> WBRtree = rpart(Class ~ ., data=WiscBrC, method = "class", control=list(minbucket=20))
n= 699
node), split, n, loss, yval, (yprob)
      * denotes terminal node
1) root 699 241 0 (0.65522175 0.34477825)
  2) UnifSiz< 2.5 429  12 0 (0.97202797 0.02797203) *
  3) UnifSiz>=2.5 270  41 1 (0.15185185 0.84814815)
    6) UnifShap< 2.5  23   5 0 (0.78260870 0.21739130) *
    7) UnifShap>=2.5 247  23 1 (0.09311741 0.90688259) *
> table(WiscBrC$Class, WBRtree$where)
      2   4   5
  0 417  18  23
  1  12   5 224
### very similar classification with just 3 terminal nodes!!
### note that the stepwise logistic model ModD did not use UnifShap, although the tree splits on it
> round(rbind(count=table(WiscBrC$UnifShap),
       frac.Malign=aggregate(WiscBrC$Class, by=list(WiscBrC$UnifShap), mean)$x),3)
                  1      2      3      4      5    6      7      8 9 10
count       353.000 59.000 56.000 44.000 34.000 30.0 30.000 28.000 7 58
frac.Malign   0.006  0.119  0.411  0.705  0.912  0.9  0.933  0.964 1  1

> table(WiscBrC$Class[is.na(WiscBrC$BareNucl)])
 0  1
14  2
### omit the BareNucl variable in the Random Forest fit
> WiscBrC$Class = factor(WiscBrC$Class)
> WBrRF1 = randomForest(formula = Class ~ ., data=WiscBrC[,-7], ntree = 5000, sampsize = 400)
> WBrRF1$confusion
    0   1 class.error
0 443  15  0.03275109
1   7 234  0.02904564
### again better than rpart but not logistic!

> WBrRF1$importance
          MeanDecreaseGini     ### Gini importance, averaged over the trees of the forest
ID                3.904000
Thick            14.255952
UnifSiz          42.394457
UnifShap         37.848168
MargAdhes        10.025784
EpithSiz         19.077996
BlChrom          25.700934
Nucleoli         18.380379
Mitoses           2.430750
Group             1.971491
seqno             4.103469

### SO IN THIS EXAMPLE, WE ALSO SEE THAT THE WAY THE STEPWISE LOGISTIC-REGRESSION
##  MODEL SELECTION WAS DONE, WE MISSED AN IMPORTANT INTERACTION BETWEEN UnifShap AND UnifSiz
##  THAT THE GRAPHICAL METHODS PICKED UP.  BUT IT WAS "IMPORTANT" ONLY IN THE ABSENCE
##  OF MANY OTHER VARIABLES, e.g. in the stripped-down decision tree found by rpart.

> ModF = update(ModD, formula = .~. + UnifShap*UnifSiz)
> c(logLik(ModF), logLik(ModD))
[1] -45.48548 -47.06909
## IN LIKELIHOOD OR LRT TERMS, THIS ADDITIONAL INTERACTION MAKES LITTLE DIFFERENCE!
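## To turn the logLik comparison above into a formal test, a sketch (not run in the original
## session): ModF adds the UnifShap main effect plus the UnifShap:UnifSiz interaction to ModD,
## i.e. 2 extra parameters, and the deviance drops by only 2*(47.069 - 45.485) = 3.17.

  anova(ModD, ModF, test="Chisq")         # likelihood-ratio (analysis-of-deviance) comparison
  2*(as.numeric(logLik(ModF)) - as.numeric(logLik(ModD)))   # = 3.167, the change in deviance
  pchisq(3.167, df=2, lower.tail=FALSE)   # approx 0.21, not a significant improvement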
##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
## Final Example from the UCI Repository -- Digital Character recognition
##   Preprocessed data from NIST 1998, 64 attributes per record plus class
##   (from 32x32 pixel images).  All input attributes are integers in the range 0..16.
##   The last attribute is the class code 0..9.

> digi.test = read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes",
           header=F)     ### 1797 x 65
  digi.trn = read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra",
           header=F)     ### 3823 x 65
> names(digi.test)[65] = names(digi.trn)[65] = "class"
> digi.test$class = factor(ifelse(digi.test$class==0,1,0))   ### consider only "0" vs "other"
  digi.trn$class  = factor(ifelse(digi.trn$class==0,1,0))
> maxlev = sapply(rbind.data.frame(digi.test,digi.trn)[,-65], max)
  for(i in 1:64) {
     digi.test[,i] = factor(digi.test[,i], levels=0:maxlev[i])
     digi.trn[,i]  = factor(digi.trn[,i],  levels=0:maxlev[i]) }

> library(rpart)
  DigFit0 = rpart(class ~ ., data=digi.trn, method = "class", control=list(minbucket=50))
  DigFit0
n= 3823
node), split, n, loss, yval, (yprob)
      * denotes terminal node
1) root 3823 376 0 (0.901647920 0.098352080)
  2) V37=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 3276  24 0 (0.992673993 0.007326007) *
  3) V37=0 547 195 1 (0.356489945 0.643510055)
    6) V34=0,1 191  23 0 (0.879581152 0.120418848) *
    7) V34=2,3,4,5,6,7,8,9,10,12 356  27 1 (0.075842697 0.924157303) *
> table(class=digi.trn$class, pred=ifelse( predict(DigFit0)[,2]>0.5,1,0))
     pred
class    0    1
    0 3420   27
    1   47  329
> table(class=digi.test$class, pred=ifelse( predict(DigFit0,digi.test[,-65])[,2]>0.5,1,0))
     pred
class    0    1
    0 1596   23
    1    8  170

> library(randomForest)
  DigFit1 = randomForest(formula = class ~ ., data=digi.trn, ntree = 5000,
                         sampsize = 1000, replace=F)
  DigFit1$confusion
     0   1 class.error
0 3447   0 0.00000000     ### no mistakes when the character is not "0" !!
1   15 361 0.03989362     ###  4% misclassification when the character is "0"
  table(true=digi.test$class, pred=ifelse( predict(DigFit1,digi.test[,-65])==1,1,0))
    pred
true    0    1
   0 1618    1
   1    7  171
## performance almost as good on the test set !!

  DigFit2 = randomForest(formula = class ~ ., data=digi.trn, ntree = 5000,
                         replace=F)      ### default sampsize = .632*3823
> DigFit2$confusion
     0   1  class.error
0 3446   1 0.0002901073
1   11 365 0.0292553191
  table(true=digi.test$class, pred=ifelse( predict(DigFit2,digi.test[,-65])==1,1,0))
    pred
true    0    1
   0 1619    0
   1    3  175
### this version seems definitely better !!
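## An added sketch (not in the original log) condensing the three test-set tables above into
## single misclassification rates; from those tables the values are 31/1797 = 0.017 for the
## rpart tree, 8/1797 = 0.0045 for DigFit1, and 3/1797 = 0.0017 for DigFit2.

  testErr = function(pred) mean(pred != digi.test$class)
  round(c(rpart = testErr(predict(DigFit0, digi.test[,-65], type="class")),
          RF1   = testErr(predict(DigFit1, digi.test[,-65])),
          RF2   = testErr(predict(DigFit2, digi.test[,-65]))), 4)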
> ind = order(DigFit2$importance)[64:55]
  cbind.data.frame(varbl=paste0("V",ind), MeanDecrGini=DigFit2$importance[ind])
   varbl MeanDecrGini
1    V37     72.49697
2    V29     46.49720
3    V31     34.95095
4    V36     31.15619
5    V34     23.75331
6    V39     21.30727
7    V43     20.67002
8    V28     19.92173
9    V22     14.80171
10   V42     12.99494
### much more information about useful variables than in the sketchy rpart tree

## The $forest output contains full information on all trees grown
> sapply(DigFit2$forest, class)
 ndbigtree nodestatus    bestvar    treemap   nodepred xbestsplit        pid     cutoff
 "integer"   "matrix"   "matrix"    "array"   "matrix"   "matrix"  "numeric"  "numeric"
      ncat     maxcat    nrnodes      ntree     nclass    xlevels
 "integer"  "integer"  "integer"  "numeric"  "integer"     "list"
> sapply(DigFit2$forest, length)
 ndbigtree nodestatus    bestvar    treemap   nodepred xbestsplit        pid     cutoff
      5000     605000     605000    1210000     605000     605000          2          2
      ncat     maxcat    nrnodes      ntree     nclass    xlevels
        64          1          1          1          1         64
> unlist(DigFit2$forest[10:13])
maxcat nrnodes   ntree  nclass
    17     121    5000       2
> dim(DigFit2$forest$nodestatus)
[1]  121 5000      ### 121 possible nodes, info kept for each of the 5000 trees
> table(c(DigFit2$forest$nodestatus))
    -1      0      1
176824 256352 171824      ### total 121*5000

### How to look at the saved trees?
> num2binary = function(intgr) {
     xpnd = NULL
     while(intgr) {
        dgt = intgr %% 2
        xpnd = c(dgt, xpnd)
        intgr = intgr %/% 2 }
     xpnd }
> tmptree = getTree(DigFit2, 5)
> class(tmptree)
[1] "matrix"
> dim(tmptree)
[1] 65  6      ### 65 nodes; the columns are
  [1] "left daughter"  "right daughter" "split var"      "split point"    "status"
  [6] "prediction"
> tmptree[1:10,]
   left daughter right daughter split var split point status prediction
1              2              3        30       16368      1          0
2              4              5        37           1      1          0
3              6              7        31         504      1          0
4              8              9        34       65532      1          0
5             10             11        35       12544      1          0
6             12             13        42        2036      1          0
7             14             15        47       10256      1          0
8             16             17        50      105983      1          0
9             18             19        59       94222      1          0
10            20             21        31      110064      1          0

## Split points for categorical split variables are integers to be interpreted as binary
## expansions, with 1's indicating the categories of the split variable that get sent to
## the left daughter (a small decoding helper is sketched below), e.g.
> levels(digi.trn$V30)
 [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16"
> num2binary(16368)
 [1] 1 1 1 1 1 1 1 1 1 1 0 0 0 0
## Think of this as a binary sequence with 17 places, one per level, whose bits count from
## right to left; so the categories "4"-"13" are sent to the left daughter, all others to the right.
> levels(digi.trn$V37)
 [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16"
## The split-point 1 for the second node means that category "0" gets sent left and all
## others get sent right, AND THIS IS THE SAME SPLIT ON V37 THAT APPEARED IN THE rpart TREE.
## Similarly the split-point 65532 on V34 IMPLIES THAT CATEGORIES "0","1" move to the right
## daughter and all others to the left, as in the rpart-tree split on V34.
> num2binary(65532)
 [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
> levels(digi.trn$V34)
 [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
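## A hypothetical helper (an added sketch, not in the original session) that automates the
## decoding just illustrated: given a split point and the factor levels of the split variable,
## it lists the categories sent to the left daughter, using the convention above
## (bit k, counting from the right, corresponds to level k).

  levelsLeft = function(splitpoint, levs) {
     bits = rev(num2binary(splitpoint))   # reverse so that bits[k] corresponds to levs[k]
     levs[which(bits == 1)] }
  levelsLeft(16368, levels(digi.trn$V30))   # should give "4" through "13"
  levelsLeft(65532, levels(digi.trn$V34))   # should give "2" through "15", i.e. "0","1" go right
  levelsLeft(1,     levels(digi.trn$V37))   # should give just "0"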
#---
## "status" values are 1, indicating that the node is split, or -1, indicating a terminal node.
## The same status values appear in the DigFit2$forest$nodestatus matrix, where node status 0
## indicates that the node does not exist in that tree.
> DigFit2$forest$nodestatus[,5]
  [1]  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1 -1  1  1  1 -1  1 -1  1 -1  1 -1  1
 [29] -1  1  1 -1 -1  1 -1 -1 -1 -1 -1 -1  1 -1  1  1 -1  1 -1  1 -1 -1 -1 -1 -1  1 -1 -1
 [57] -1 -1  1 -1 -1 -1 -1 -1 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
 [85]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
[113]  0  0  0  0  0  0  0  0  0
> summary(c(DigFit2$forest$xbestsplit))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
      0       0       0   18084      99  131070
### The xbestsplit numbers are the "split point" numbers telling which categories of the
##  current split variable move to the left and right daughters
> DigFit2$forest$xbestsplit[1:10,5]
 [1]  16368      1    504  65532  12544   2036  10256 105983  94222 110064
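## One last added check (a sketch, not in the original log): these first ten xbestsplit values
## for tree 5 are the same numbers as the "split point" column of tmptree = getTree(DigFit2, 5)
## displayed earlier, which can be verified directly.

  all( DigFit2$forest$xbestsplit[1:10, 5] == tmptree[1:10, "split point"] )   # should be TRUE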