Log Illustrating Minimization                   
=============================  including pieces from 2008, current as of Fall 2017

We have three functions to use in numerical minimizations:

--- optimize         [ one-dimensional argument ]

ptimize(f = , interval = , lower = min(interval),
         upper = max(interval), maximum = FALSE,
         tol = .Machine$double.eps^0.25, ...)

--- nlm              [ multi-dimensional: do NOT need to supply
                       derivatives, but CAN supply "gradient" 
                       and if desired "hessian" attributes
                       for prurposes of speedup. ]

nlm(f, p, hessian = FALSE, typsize=rep(1, length(p)), fscale=1,
    print.level = 0, ndigit=12, gradtol = 1e-6,
    stepmax = max(1000 * sqrt(sum((p/typsize)^2)), 1000),
    steptol = 1e-6, iterlim = 100, check.analyticals = TRUE, ...)

--- nlminb           [multidimensional with "box constraints"]

nlminb(start, objective, gradient = NULL, hessian = NULL,
       scale = 1, control = list(), lower = -Inf, upper = Inf, ...)

===========================================================
> ft
function(x,a) { res= sum((x-a)^4)
    attr(res, "gradient") = 4*(x-a)^3
    attr(res,"hessian") = 12*diag((x-a)^2)
    res }

### attr refers to object attributes by name
###   (usually to define them), for example

> attr(table(rpois(10,3)),"names")
[1] "1" "3" "4" "5"

## or 

> attr(factor(sample(1:5,20,rep=T)),"levels")
[1] "1" "2" "3" "4" "5"

====================================================

> ft(c(3,4),c(2,5))
[1] 2
attr(,"gradient")
[1]  4 -4
attr(,"hessian")
     [,1] [,2]
[1,]   12    0
[2,]    0   12


> nlm(ft, c(0,0), a = c(2,5))
$minimum
[1] 1.343654e-10

$estimate
[1] 1.998647 4.996617

$gradient
[1] -9.913385e-09 -1.548966e-07

$code
[1] 1

$iterations
[1] 18

==========================================================
EXAMINE THE EVALUATION OF MAXIMA FOR LIKELIHOOD-LIKE 
FUNCTIONS WITH AND WITHOUT GRADIENT AND HESSIAN ATTRIBUTES
==========================================================

### Simple case:  2500 Weibull observatins with shape 1.6, scale 0.4

xdat = rweibull(2500,1.6,0.4)

logLWeib0 = function (par2,dvec) {
      n = length(dvec)
      a = par2[1]
      b = par2[2]
      -log(a)+a*log(b) - (a-1)*mean(log(dvec)) + 
           mean(dvec^a)/b^a }

unix.time({ tmp = nlm(logLWeib0, c(1,1), dvec=xdat) })
   user  system elapsed 
   0.05    0.00    0.04 
> str(tmp)
List of 5
 $ minimum   : num -0.145
 $ estimate  : num [1:2] 1.559 0.403
 $ gradient  : num [1:2] -2.39e-07 1.14e-07
 $ code      : int 1
 $ iterations: int 8
## gradients of order 1e-7: NB accurate parameter est's!

logLWeib1 = function (par2,dvec) {
      n = length(dvec)
      a = par2[1]
      b = par2[2]
      obj = -log(a)+a*log(b) - (a-1)*mean(log(dvec)) + 
           mean(dvec^a)/b^a
      attr(obj,"gradient") = c(-1/a + log(b) - 
            mean(log(dvec)) + mean(log(dvec/b)*dvec^a)/b^a,
            a/b- a*mean(dvec^a)/b^(a+1))   
      obj }

> logLWeib0(c(1,1),xdat)
[1]  0.3620798
> logLWeib1(c(1,1),xdat)
[1]  0.3620798
attr(,"gradient")
[1] -0.009937885  0.637920180

> (logLWeib0(c(1+1e-6,1),xdat)-logLWeib0(c(1-1e-6,1),xdat))/2e-6
[1] -0.009937885
> (logLWeib0(c(1,1+1e-6),xdat)-logLWeib0(c(1,1-1e-6),xdat))/2e-6
[1] 0.6379202

unix.time({ tmp = nlm(logLWeib1, c(1,1), dvec=xdat) })
   user  system elapsed 
   0.07    0.00    0.07 
### same result but longer computation ! 

unix.time({ tmp = nlm(logLWeib0, c(1.6,0.4), dvec=xdat) })
   user  system elapsed 
   0.06    0.00    0.06   ## Note longer time, more iterations !
> str(tmp)
List of 5
 $ minimum   : num -0.145
 $ estimate  : num [1:2] 1.559 0.403
 $ gradient  : num [1:2] -4.17e-08 3.61e-07
 $ code      : int 1
 $ iterations: int 10


logLWeib2 = function (par2,dvec) {
      n = length(dvec)
      a = par2[1]
      b = par2[2]
      obj = -log(a)+a*log(b) - (a-1)*mean(log(dvec)) + 
           mean(dvec^a)/b^a
         attr(obj,"gradient") = c(-1/a + log(b) - 
            mean(log(dvec)) + mean(log(dvec/b)*dvec^a)/b^a,
            a/b- a*mean(dvec^a)/b^(a+1))   
      attr(obj,"hessian") = array(c(1/a^2 + 
            mean(log(dvec/b)^2*dvec^a)/b^a, rep(1/b-
            mean(dvec^a)/b^(a+1)-(a/b^(a+1))*mean(log(dvec/b)*
            dvec^a),2), -a/b^2 + (a*(a+1)/b^(a+2))*mean(dvec^a)),
            c(2,2))
      obj }


> logLWeib2(c(1.6,0.4),xdat)
[1] -0.1445586
attr(,"gradient")
[1]  0.03327704 -0.09032663
attr(,"hessian")
          [,1]      [,2]
[1,]  0.732797 -1.228535
[2,] -1.228535 16.587123


> (attr(logLWeib1(c(1.6+1e-6,.4),xdat),"gradient")-
+  attr(logLWeib1(c(1.6-1e-6,.4),xdat),"gradient"))/2e-6
[1]  0.732797 -1.228535
> (attr(logLWeib1(c(1.6,.4+1e-6),xdat),"gradient")-
+  attr(logLWeib1(c(1.6,.4-1e-6),xdat),"gradient"))/2e-6
[1] -1.228535 16.587123


> logLWeib2(c(1,1),xdat)
[1] 0.3620798
attr(,"gradient")
[1] -0.009937885  0.637920180
attr(,"hessian")
         [,1]       [,2]
[1,] 1.357504  1.2914855
[2,] 1.291486 -0.2758404     ### Note not pos-def at (1,1) !

unix.time({ tmp = nlm(logLWeib2, c(1,1), hessian=T, dvec=xdat) })
   user  system elapsed 
   0.29    0.00    0.29 
> str(tmp)
List of 6
 $ minimum   : num -0.145
 $ estimate  : num [1:2] 1.559 0.403
 $ gradient  : num [1:2] 1.88e-12 -1.75e-07
 $ hessian   : num [1:2, 1:2] 0.741 -1.041 -1.041 14.966
 $ code      : int 1
 $ iterations: int 8
## Note more accurate answer , gradient smaller

attr(logLWeib2(c(tmp$est),xdat),"hessian")
          [,1]      [,2]
[1,]  0.740968 -1.041975
[2,] -1.041975 14.983215
tmp$hessian
           [,1]      [,2]
[1,]  0.7409032 -1.040887
[2,] -1.0408871 14.966268    ## NOTE not exactly the same !

### One check on the validity of the model assumption 
### is to compare the Fisher Info found here with the
### sum of gradients time their transposes: the coding 
### is as follows:

> WeibScor = function(par2,dvec) {
      n = length(dvec)
      a = par2[1]
      b = par2[2]
      scor = cbind(-1/a + log(b) - log(dvec) + 
           log(dvec/b)*(dvec/b)^a, a/b- (a/b^(a+1))*dvec^a)
      list(scor=scor,  gradllik = c(t(scor) %*% rep(1,n)),
           altInfo = t(scor) %*% scor/n) }


> xdat = rweibull(2500,1.6,0.4)
> logLWeib2(c(1.6,0.4),xdat)
[1] -0.1710576
attr(,"gradient")
[1] -0.004228303 -0.005230492
attr(,"hessian")
          [,1]      [,2]
[1,]  0.713670 -1.060161
[2,] -1.060161 16.033998

> unlist(WeibScor(c(1.6,0.4),xdat)[2:3])
  gradllik1   gradllik2    altInfo1    altInfo2    altInfo3    altInfo4
-10.5707566 -13.0762305   0.7369663  -1.1290419  -1.1290419  16.0853598


========================================
Listings for Do-it Yourself Optimization
========================================

> Gradmat
function(parvec, infcn, eps = 1e-06)
{
# Function to calculate the difference-quotient approximate gradient
#   (matrix) of an arbitrary input (vector) function     infcn
# Now recoded to use central differences !
        dd <- length(parvec)
        aa <- length(infcn(parvec))
        epsmat <- (diag(dd) * eps)/2
        gmat <- array(0, dim = c(aa, dd))
        for(i in 1:dd)
                gmat[, i] <- (infcn(parvec + epsmat[, i]) - infcn(parvec - 
                        epsmat[, i]))/eps
        if(aa > 1)
                gmat
        else c(gmat)
}


NRroot
function(inipar, infcn, nmax = 25, stoptol = 1e-05, 
    eps = 1e-06, gradfunc = NULL)
{
### This function implements a simple Newton-Raphson iteration
## to find the root  x  of the (vector) function  infcn(x), 
## but for the solution to be well-defined and for NRroot to
## work properly, the dimension  q  of the function values  infcn(x)
## and of the parameter  x should both be the same.  In this case, 
## if the gradient   gradfunc  is supplied, it should be the qxq 
## Jacobian matrix of the function  infcn  with respect to its 
## arguments.         Function checked/updated as of 3/31/03
    assign("Infcn", infcn, frame = 0)
    assign("Eps", eps, frame = 0)
    if(is.null(gradfunc))
        gradfunc <- function(x)
        Gradmat(x, Infcn, Eps)
    ctr <- 0
    newpar <- inipar
    oldpar <- inipar - 1
    while(ctr < nmax & sqrt(sum((newpar - oldpar)^2)) > stoptol) {
        oldpar <- newpar
        newpar <- oldpar - solve(gradfunc(oldpar), infcn(oldpar))
        ctr <- ctr + 1
    }
    list(nstep = ctr, initial = inipar, final = newpar, 
        funcval = infcn(newpar))
}

> GradSrch 
function(inipar, infcn, step, nmax = 25, stoptol = 1e-05, 
        unitfac = F, eps = 1e-06, gradfunc = NULL)
{
# Function to implement Steepest-descent. The   unitfac  condition
#   indicates whether or not the supplied step-length factor(s) multiply
#  the negative gradient itself, or the unit vector in the same direction.
   assign("Infcn", infcn, frame = 0)
   assign("Eps", eps, frame = 0)
   if(is.null(gradfunc))
         gradfunc <- function(x)
         Gradmat(x, Infcn, Eps)
   steps <- if(length(step) > 1) step else rep(step, nmax)
   newpar <- inipar
   oldpar <- newpar - 1
   ctr <- 0
   while(ctr < nmax & sqrt(sum((newpar - oldpar)^2)) > stoptol) {
         ctr <- ctr + 1
         oldpar <- newpar
         newstep <- gradfunc(oldpar)
         newstep <- if(unitfac) newstep/sum(newstep^2) else newstep
         newpar <- oldpar - steps[ctr] * newstep
   }
   list(nstep = ctr, initial = inipar, final = newpar, 
         funcval = infcn(newpar))
}