SOLUTION TO HW2, FALL 2017, STAT705     ### with edits by grader

based on dataset "nepali" in package "faraway"

## (a)  
# use function unique to extract unique values of id

        # use function length to count number of unique values
        
# There are 200 unique children in the data set.
> length(unique(nepali$id))
[1] 200

## (b) 
# The household id can be represented by the first 5 digits of variable id.
       
# Use function substr to extract the first 5 digits of each child's id.
       
# Use unique to extract unique values of household id.
       
# Use function length to count the number of unique values.
> length(unique(substr(nepali$id,1,5)))
[1] 113
       # Hence, there are 113 unique households in the data set. 

## (c)  
# Based on the table below, no observation is missing exactly one of the two fields:
# wt and ht are either both present (877 records) or both missing (123 records).
> table(is.na(nepali$wt), is.na(nepali$ht))
       
        FALSE TRUE
  FALSE   877    0
  TRUE      0  123

## (d)  
# create a function to count number of missing values of a column

       # use function sapply to apply the function to each column of nepali
       
# The number of missing values is 0 for every column except wt and ht.
> sapply(nepali, function(col) sum(is.na(col)))
   id   sex    wt    ht  mage   lit  died alive   age 
    0     0   123   123     0     0     0     0     0 
     #### no other missing data
  
##(e)
> summary(diff(nepali$id))
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
     0.0      0.0      0.0    400.5      0.0 239319.0 
        #### the minimum difference is 0, so the id's are sorted in nondecreasing numerical order
### Build HHdat: one row per household, holding the within-household range (max - min)
### of each mother-level field.  A nonzero entry means that field varies within the HH.
### The leading 0 in tmp makes the first record compare as the start of a new household;
### the comparison relies on the id's being sorted, as checked just above.
### NOTE: 1001 (= 1000 records + 1) and 113 (= number of households from (b)) are hard-coded.
> tmp = c(0,as.numeric(substr(nepali$id,1,5)))		
  newind = which(tmp[-1] > tmp[-1001])   ### 113 indices at which new households begin
  newind = c(newind,1001)   ### sentinel, so that newind[i+1]-1 is the last row of household i
  HHdat = array(0, c(113,4))
  for (i in 1:113) HHdat[i,] = apply(data.matrix(nepali[newind[i]:(newind[i+1]-1),
            c("mage","lit","died","alive")]), 2, function(col) max(col)-min(col))
> sum(apply(HHdat,1,sum) > 0)
[1] 13
              ### so there IS some variation in the "mother" fields within households (in 13 of them)

### ANOTHER SLICKER WAY TO DO THE SAME VERIFICATION

### Group the four mother-level columns by household (first 5 digits of id) and take the
### variance of each column within each group; column 1 of HHdat2 is the household label.
> HHdat2 = aggregate.data.frame(nepali[,c("mage","lit","died","alive")], 
         by=list(substr(nepali$id,1,5)), function(col) var(col)  )  
### Count households whose variances sum to more than 0, i.e. where at least one field varies.
> sum(apply(data.matrix(HHdat2[,-1]),1,sum) > 0 )
[1] 13

### Row indices (into HHdat) of the households with any within-household variation,
### followed by their ranges.  Columns are, in order: mage, lit, died, alive.
> HHinds = which(apply(HHdat,1,sum)>0)
> HHdat[HHinds,]
      [,1] [,2] [,3] [,4]
 [1,]   28    0    0    6
 [2,]    8    0    0    1
 [3,]    9    0    0    2
 [4,]   17    0    2    6
 [5,]    3    0    0    1
 [6,]    0    0    0    1
 [7,]   10    0    3    5
 [8,]    2    0    0    0
 [9,]   15    0    0    6
[10,]    6    0    3    2
[11,]    2    0    0    1
[12,]    7    0    2    4
[13,]    7    1    0    1

#### So there are only 13 households in which a mother data-field varies within the HH.
#### In all but one of them, the mother's age differs across some of the records;
####   in the remaining one, the mother's age is constant but the "alive" field changes:

### Print the mother-level fields for the 6th varying household (mage range 0, alive range 1).
### NOTE(review): HHinds indexes rows of HHdat (households in file order), while HHdat2's rows
###   are in the group order produced by aggregate; this assumes the two orders coincide -- confirm.
> nepali[substr(nepali$id,1,5)==HHdat2[HHinds[6],1],c("mage","lit","died","alive")]
    mage lit died alive
496   29   1    0     3
497   29   1    0     3
498   29   1    0     3
499   29   1    0     3
500   29   1    0     3
501   29   1    0     3
502   29   1    0     3
503   29   1    0     3
504   29   1    0     3
505   29   1    0     3
506   29   1    0     4
507   29   1    0     4
508   29   1    0     4
509   29   1    0     4
510   29   1    0     4
511   29   1    0     4
512   29   1    0     4
513   29   1    0     4
514   29   1    0     4
515   29   1    0     4
                           ### HH in which a new child was apparently born ("alive" rises from 3 to 4)

### (f)  

### Summarize mother's age using one record per household (the first record of each):
### newind[-114] drops the trailing sentinel 1001, leaving the 113 household start rows.
> summary(nepali$mage[newind[-114]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  17.00   24.00   27.00   27.88   32.00   42.00 					   
  
### Histogram of mother's age, taken from the first record of each of the 113 households.
> hist(nepali$mage[newind[-114]])

#### Grader suggests another way to do (e)

### Split nepali into a list of per-household data frames, keyed by the first 5 digits of id.
### The id column is referenced as nepali$id: a bare `id` is not in scope unless nepali is attached.
> nepali.hh = split(x=nepali, f=substr(nepali$id, 1, 5))