# Homework Problem 3 Solution 9/13/2017

## (a)

# framfix: clean a data frame and summarise its numeric content.
#
# Steps, in order:
#   1. In every character column, recode missing entries (NA or the empty
#      string "") to "Mis".
#   2. In every column named "age" or "Age", replace each positive value by
#      the midpoint of its 5-unit bin, 5 * ceiling(age / 5) - 2.5, and every
#      non-positive value by 0 (NA stays NA).
#   3. Drop every column in which more than 30% of the values are NA.
#   4. Compute the 10%, 25%, 50%, 75% and 90% quantiles of each numeric
#      column. Logical columns are not included here because they are only
#      converted to numeric in the next step.
#   5. Convert logical columns to numeric (TRUE -> 1, FALSE -> 0).
#   6. Report which rows and which numeric columns still contain NA.
#
# Argument:
#   infram  a data frame.
#
# Value: a list with components
#   newfram  the cleaned data frame,
#   Quant    matrix of quantiles (one row per probability, one column per
#            numeric column),
#   Rows     numeric vector of row indices that contain an NA in any numeric
#            column of newfram,
#   Cols     names of the numeric columns of newfram that contain an NA.
framfix <- function(infram) {
  # Recode missing values in character columns. Both NA and "" count as
  # missing; testing is.na() first keeps NA out of the logical index, which
  # would otherwise make the data-frame assignment fail.
  char_cols <- which(vapply(infram, is.character, logical(1)))
  for (i in char_cols) {
    column <- infram[[i]]
    column[is.na(column) | column == ""] <- "Mis"
    infram[[i]] <- column
  }

  # Recode the columns named "age" or "Age" to 5-unit bin midpoints.
  age_cols <- which(names(infram) %in% c("age", "Age"))
  for (i in age_cols) {
    age <- infram[[i]]
    infram[[i]] <- ifelse(age > 0, 5 * ceiling(age / 5) - 2.5, 0)
  }

  # Drop columns with more than 30% missing values. drop = FALSE keeps the
  # result a data frame even when a single column survives.
  many_miss <- which(
    vapply(infram, function(col) mean(is.na(col)) > 0.3, logical(1))
  )
  newfram <- if (length(many_miss) > 0) {
    infram[, -many_miss, drop = FALSE]
  } else {
    infram
  }

  # Quantiles of the numeric columns (computed before logicals are converted).
  num_cols <- which(vapply(newfram, is.numeric, logical(1)))
  Quant <- apply(
    data.matrix(newfram[, num_cols, drop = FALSE]), 2, quantile,
    probs = c(0.1, 0.25, 0.5, 0.75, 0.9), na.rm = TRUE
  )

  # Convert logical columns to numeric 0/1 values.
  log_cols <- which(vapply(newfram, is.logical, logical(1)))
  for (i in log_cols) {
    newfram[[i]] <- as.numeric(newfram[[i]])
  }

  # Locate the remaining missing values among the (now extended) numeric
  # columns.
  num_cols <- which(vapply(newfram, is.numeric, logical(1)))
  missing_mask <- is.na(data.matrix(newfram[, num_cols, drop = FALSE]))

  list(
    newfram = newfram,
    Quant = Quant,
    # indices of rows containing missing values
    Rows = as.numeric(which(rowSums(missing_mask) > 0)),
    # names of columns containing missing values
    Cols = names(newfram)[num_cols][colSums(missing_mask) > 0]
  )
}

## (b)

# (b) Part 1
# > tmp = framfix(nepali)
# > names(tmp)
# [1] "newfram" "Quant"   "Rows"    "Cols"
# > tmp[[3]]
#   [1]   5  10  19  25  29  82  84  93 151 167 168 169 170 172 173 174 175 182
#  [19] 183 193 199 280 304 305 309 310 331 429 462 463 464 465 474 475 484 497
#  [37] 498 499 500 502 503 504 505 506 508 511 513 520 522 524 525 527 529 530
#  [55] 535 541 542 543 544 545 580 599 618 620 625 630 661 662 663 664 665 666
#  [73] 667 668 669 670 693 707 720 725 737 740 752 767 768 769 770 772 792 802
#  [91] 806 807 808 809 847 850 852 855 862 870 876 878 880 886 890 891 895 896
# [109] 899 900 916 935 936 941 945 965 968 969 970 985 988 990 997
# > tmp[[2]]
#           id sex   wt     ht mage lit died alive  age
# 10% 120131.9   1  7.4  69.56 20.9   0    0     1 12.5
# 25% 120368.8   1  9.0  76.40 24.0   0    0     2 22.5
# 50% 360111.5   1 11.1  84.80 28.0   0    0     4 37.5
# 75% 360591.2   2 13.2  94.30 34.0   0    1     6 52.5
# 90% 360923.0   2 15.2 101.70 35.0   0    2     8 62.5
# > table(tmp$newfram[,"age"])
#    0  2.5  7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5 72.5
#    1   23   40   68   96   81   92   76   76   73   74   83   78   68   49   20
# 77.5
#    2

# (b) Part 2
# > Titanic3 = read.csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv", stringsAsFactors=F)
# NOTE(review): the transcript does not show it, but the results below
# presumably come from re-running `tmp = framfix(Titanic3)` -- confirm.
# > tmp[[4]]
# [1] "age"  "fare"
# > length(tmp$Rows)
# [1] 264
# > tmp$Quant
#     pclass survived  age sibsp parch     fare
# 10%      1        0 12.5     0     0  7.56750
# 25%      2        0 22.5     0     0  7.89580
# 50%      3        0 27.5     0     0 14.45420
# 75%      3        1 37.5     1     0 31.27500
# 90%      3        1 47.5     1     2 78.05082
# > table(tmp$newfram[,"age"])
#  2.5  7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5 72.5 77.5
#   56   30   29  133  195  166  115   95   72   60   36   26   23    4    4    2