Solution to Homework 15, March 18, 2008.
========================================

## First we code and test a little function to do a
## single optimized gradient step.  The step length
## multiplies a unit vector in the direction of
## the gradient vector.  The univariate step-length
## optimization is done with the R function "optimize",
## whose tolerance is set through the argument "tol"
## (passed down as "opttol" by the driver GradOpt below).

GradStep = function(ffcn, fprim, xin, step=1, tol=1e-5) {
    ## Here step is the maximum multiple of the
    ## unit vector in the gradient direction.
      newv = fprim(xin)
      norm = sqrt(sum(newv^2))
      unitnew = newv/norm
      t = optimize(function(t, func, vec, bas) func(bas - t*vec),
            c(0, step), tol=tol, func=ffcn, vec=unitnew, bas=xin)$minimum
      xout = xin - t*unitnew
      list(xnew = xout, fval = ffcn(xout), fprim = fprim(xout)) }

## Here is a little test of GradStep on the following
## function and gradient:

> f1
function(x) x[1]^2 + 5*x[1]*x[2] + 10*x[2]^2 + 4*x[1]
> fp1
function(x) c(2*x[1]+5*x[2]+4, 5*x[1]+20*x[2])

## Test follows: ten successive gradient steps from the
## starting point (2,3).

> xin = c(2,3)
> for(i in 1:10) {
+    aux = GradStep(f1, fp1, xin, 10)
+    xin = aux$xnew
+    cat(newx = round(xin, 5), obj = aux$fval,
+        grad = aux$fprim, "\n")  }
0.91569 -0.30007 4.331006 -1.423045
-4.57801 1.505 2.368971 7.209911
-4.68969 1.16509 0.4460881 -0.1465718
-5.25554 1.35101 0.2440010 0.7426116
-5.26704 1.316 0.04594651 -0.01509671
-5.32532 1.33515 0.02513179 0.07648805
-5.32651 1.33155 0.004732432 -0.001554942
-5.33251 1.33352 0.002588542 0.007878172
-5.33263 1.33315 0.0004874344 -0.0001601570
-5.33325 1.33335 0.0002666166 0.0008114418

## Now we put together code for a function doing
## the previous step-length optimized gradient
## search until the gradient vector norm is small
## enough or the allowed iterations are exhausted.

GradOpt = function(ffcn, fprim, xzer, step, opttol=1e-5,
                   grdtol=1e-5, iter=20) {
      grad = replace(xzer, T, 1)    ## dummy initial gradient so the loop starts
      xin = xzer
      if(length(step)==1) step = rep(step, iter)
      count = 0
      while (sqrt(sum(grad^2)) > grdtol & count < iter) {
         count = count+1
         aux = GradStep(ffcn, fprim, xin, step[count], tol=opttol)
         xin = aux$xnew
         grad = aux$fprim }
      list(maxit = iter, iter = count, newx = xin, grad = grad) }

## Here is the implementation on the same little 2-dim
## optimization problem.

> GradOpt(f1, fp1, c(2,3), 10)
$maxit
[1] 20

$iter
[1] 20

$newx
[1] -5.333328  1.333335

$grad
[1] 1.843724e-05 5.819765e-05

> GradOpt(f1, fp1, c(1,1), 10, grdtol=1e-6,
+         opttol=1e-7, iter=100)
$maxit
[1] 100

$iter
[1] 100

$newx
[1] -5.333334  1.333330

$grad
[1] -1.808589e-05 -7.023544e-05

### Compare these results so far with the nlm minimum.

> nlm(f1, c(2,3))
$minimum
[1] -10.66667

$estimate
[1] -5.333333  1.333333

$gradient
[1] 0 0

$code
[1] 1

$iterations
[1] 4

### We can see that the Gradient Search technique
### can be made to converge by careful choice of
### tolerances in the optimize routine, but nlm
### is far quicker for nice functions!
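
### As an optional cross-check, not part of the original solution,
### the same quadratic can be handed to base R's optim() with the
### analytic gradient fp1; the method choice "BFGS" and the object
### name "check1" below are just illustrative, and the output is
### not reproduced here.

check1 = optim(c(2,3), f1, gr=fp1, method="BFGS")
check1$par      ## should be close to the nlm estimate (-5.3333, 1.3333)
check1$value    ## should be close to the nlm minimum -10.66667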
### Now try a more complex (10-parameter) statistical example,
### using the dataset "Insurance" in library MASS.

> library(MASS);  attach(Insurance)
> glmI = glm(cbind(Claims, Holders-Claims) ~ District + Group + Age - 1,
+            family=binomial)
> glmI$coef
   District1    District2    District3    District4      Group.L      Group.Q
-1.623017073 -1.592949255 -1.578139646 -1.346033246  0.505578250  0.011576312
     Group.C        Age.L        Age.Q        Age.C
-0.033922126 -0.471033518  0.005142243 -0.019659119
> Xmat = model.matrix(glmI)    ### 64 x 10 design matrix

## Negative of the binomial log-likelihood (up to an additive
## constant not depending on b) and its gradient, as functions
## of the coefficient vector b.

NeglLik = function(b) {
      eta = c(Xmat %*% b)
      -sum(Claims*eta) + sum(Holders*log(1+exp(eta))) }

NeglLp = function(b) {
      eta = c(Xmat %*% b)
      -c(t(Xmat) %*% (Claims - Holders*plogis(eta))) }

> GradOpt(NeglLik, NeglLp, rep(0,10), 10, iter=100)
$maxit
[1] 100

$iter
[1] 100

$newx
 [1] -1.623016846 -1.592949144 -1.578139583 -1.346033215  0.505578191
 [6]  0.011576173 -0.033922019 -0.471033265  0.005142423 -0.019659036

$grad
 [1]  0.0005007822  0.0002436186  0.0001388209  0.0000671182 -0.0001291210
 [6] -0.0003053821  0.0002371743  0.0005593311  0.0003976299  0.0001822663

### So the Gradient search does very well (in 100 iterations!) at
### finding essentially the same maximum-likelihood estimate as glm.
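
### One optional way, not in the original solution, to confirm that the
### analytic gradient NeglLp is coded correctly is a central finite-
### difference check; the helper "numgrad" and the step size h below are
### illustrative choices, and the output is not reproduced here.

numgrad = function(f, b, h=1e-6) {
      ## central-difference approximation to the gradient of f at b
      sapply(1:length(b), function(j) {
           e = replace(rep(0, length(b)), j, h)
           (f(b+e) - f(b-e))/(2*h) })  }

## At the glm estimate the analytic and numerical gradients should agree
## to several decimals, and both should be very small:
round(cbind(analytic = NeglLp(glmI$coef),
            numeric  = numgrad(NeglLik, glmI$coef)), 6)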