# SOLUTION TO HW2, FALL 2017, STAT705
### with edits by grader
# Based on dataset "nepali" in package "faraway".
#
# The data frame `nepali` must already be in the workspace, e.g. after
#   data(nepali, package = "faraway")
# Lines starting with "#>" reproduce the console output recorded in the
# original session.

## (a) ----
# Use unique() to extract the distinct values of id and length() to count
# them. There are 200 unique children in the data set.
length(unique(nepali$id))
#> [1] 200

## (b) ----
# The household id is represented by the first 5 digits of the variable id.
# Use substr() to extract the first 5 digits of each child's id, unique() to
# keep the distinct household ids, and length() to count them.
length(unique(substr(nepali$id, 1, 5)))
#> [1] 113
# Hence, there are 113 unique households in the data set.

## (c) ----
# Based on the table below, no observation is missing exactly one of the two
# fields wt and ht: the two are always either both present or both missing.
table(is.na(nepali$wt), is.na(nepali$ht))
#>
#>         FALSE TRUE
#>   FALSE   877    0
#>   TRUE      0  123

## (d) ----
# Count the missing values in every column of nepali.
# The number of missing values is 0 for every column except wt and ht.
colSums(is.na(nepali))
#>    id   sex    wt    ht  mage   lit  died alive   age
#>     0     0   123   123     0     0     0     0     0
#### no other missing data

## (e) ----
summary(diff(nepali$id))
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
#>      0.0      0.0      0.0    400.5      0.0 239319.0
#### The minimum difference is 0, so the ids are sorted in increasing
#### (non-decreasing) numerical order.

# Columns that describe the mother and so should be constant within a
# household.
mother_cols <- c("mage", "lit", "died", "alive")

# Household id of every record, preceded by a 0 so that the very first record
# also registers as the start of a new household.
tmp <- c(0, as.numeric(substr(nepali$id, 1, 5)))
# Row indices at which a new household begins (113 in the recorded session).
newind <- which(tmp[-1] > tmp[-length(tmp)])
n_hh <- length(newind)
# Sentinel one past the last row, so household i occupies rows
# newind[i]:(newind[i + 1] - 1).
newind <- c(newind, nrow(nepali) + 1)

# HHdat[i, j] = within-household range (max - min) of mother field j in
# household i; a non-zero entry means the field varies within the household.
HHdat <- array(0, c(n_hh, length(mother_cols)))
for (i in seq_len(n_hh)) {
  hh_rows <- newind[i]:(newind[i + 1] - 1)
  HHdat[i, ] <- apply(
    data.matrix(nepali[hh_rows, mother_cols]), 2,
    function(col) max(col) - min(col)
  )
}
sum(rowSums(HHdat) > 0)
#> [1] 13
### so there IS some variation in the "mother" fields within households

### ANOTHER SLICKER WAY TO DO THE SAME VERIFICATION
# Within-household variance of each mother field, one row per household.
HHdat2 <- aggregate(
  nepali[, mother_cols],
  by = list(substr(nepali$id, 1, 5)),
  FUN = var
)
sum(rowSums(HHdat2[, -1]) > 0)
#> [1] 13

HHinds <- which(rowSums(HHdat) > 0)
HHdat[HHinds, ]
#>       [,1] [,2] [,3] [,4]
#>  [1,]   28    0    0    6
#>  [2,]    8    0    0    1
#>  [3,]    9    0    0    2
#>  [4,]   17    0    2    6
#>  [5,]    3    0    0    1
#>  [6,]    0    0    0    1
#>  [7,]   10    0    3    5
#>  [8,]    2    0    0    0
#>  [9,]   15    0    0    6
#> [10,]    6    0    3    2
#> [11,]    2    0    0    1
#> [12,]    7    0    2    4
#> [13,]    7    1    0    1
#### So there are only 13 households for which a mother data field varies
#### within the household. In all but one of them the mother's age differs
#### across records; in only one is the mother's age unchanged while the
#### "alive" field changes:

# NOTE(review): indexing HHdat2 with HHinds (computed from HHdat) assumes that
# aggregate() returns the households in the same order as they appear in the
# sorted data -- confirm if the ids are ever of unequal width.
nepali[substr(nepali$id, 1, 5) == HHdat2[HHinds[6], 1], mother_cols]
#>     mage lit died alive
#> 496   29   1    0     3
#> 497   29   1    0     3
#> 498   29   1    0     3
#> 499   29   1    0     3
#> 500   29   1    0     3
#> 501   29   1    0     3
#> 502   29   1    0     3
#> 503   29   1    0     3
#> 504   29   1    0     3
#> 505   29   1    0     3
#> 506   29   1    0     4
#> 507   29   1    0     4
#> 508   29   1    0     4
#> 509   29   1    0     4
#> 510   29   1    0     4
#> 511   29   1    0     4
#> 512   29   1    0     4
#> 513   29   1    0     4
#> 514   29   1    0     4
#> 515   29   1    0     4
### HH in which a new child was born

### (f) ----
# Mother's age taken from the first record of each household; the trailing
# sentinel in newind is dropped before indexing.
summary(nepali$mage[newind[-length(newind)]])
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>   17.00   24.00   27.00   27.88   32.00   42.00
hist(nepali$mage[newind[-length(newind)]])

#### Grader suggests another way to do (e)
# Split the records into a list with one data frame per household. The id
# column must be referenced as nepali$id: a bare `id` is not in scope here.
nepali.hh <- split(x = nepali, f = substr(nepali$id, 1, 5))