Log Illustrating Minimization ============================= including pieces from 2008, current as of Fall 2017 We have three functions to use in numerical minimizations: --- optimize [ one-dimensional argument ] ptimize(f = , interval = , lower = min(interval), upper = max(interval), maximum = FALSE, tol = .Machine$double.eps^0.25, ...) --- nlm [ multi-dimensional: do NOT need to supply derivatives, but CAN supply "gradient" and if desired "hessian" attributes for prurposes of speedup. ] nlm(f, p, hessian = FALSE, typsize=rep(1, length(p)), fscale=1, print.level = 0, ndigit=12, gradtol = 1e-6, stepmax = max(1000 * sqrt(sum((p/typsize)^2)), 1000), steptol = 1e-6, iterlim = 100, check.analyticals = TRUE, ...) --- nlminb [multidimensional with "box constraints"] nlminb(start, objective, gradient = NULL, hessian = NULL, scale = 1, control = list(), lower = -Inf, upper = Inf, ...) =========================================================== > ft function(x,a) { res= sum((x-a)^4) attr(res, "gradient") = 4*(x-a)^3 attr(res,"hessian") = 12*diag((x-a)^2) res } ### attr refers to object attributes by name ### (usually to define them), for example > attr(table(rpois(10,3)),"names") [1] "1" "3" "4" "5" ## or > attr(factor(sample(1:5,20,rep=T)),"levels") [1] "1" "2" "3" "4" "5" ==================================================== > ft(c(3,4),c(2,5)) [1] 2 attr(,"gradient") [1] 4 -4 attr(,"hessian") [,1] [,2] [1,] 12 0 [2,] 0 12 > nlm(ft, c(0,0), a = c(2,5)) $minimum [1] 1.343654e-10 $estimate [1] 1.998647 4.996617 $gradient [1] -9.913385e-09 -1.548966e-07 $code [1] 1 $iterations [1] 18 ========================================================== EXAMINE THE EVALUATION OF MAXIMA FOR LIKELIHOOD-LIKE FUNCTIONS WITH AND WITHOUT GRADIENT AND HESSIAN ATTRIBUTES ========================================================== ### Simple case: 2500 Weibull observatins with shape 1.6, scale 0.4 xdat = rweibull(2500,1.6,0.4) logLWeib0 = function (par2,dvec) { n = length(dvec) a = par2[1] b = par2[2] -log(a)+a*log(b) - (a-1)*mean(log(dvec)) + mean(dvec^a)/b^a } unix.time({ tmp = nlm(logLWeib0, c(1,1), dvec=xdat) }) user system elapsed 0.05 0.00 0.04 > str(tmp) List of 5 $ minimum : num -0.145 $ estimate : num [1:2] 1.559 0.403 $ gradient : num [1:2] -2.39e-07 1.14e-07 $ code : int 1 $ iterations: int 8 ## gradients of order 1e-7: NB accurate parameter est's! logLWeib1 = function (par2,dvec) { n = length(dvec) a = par2[1] b = par2[2] obj = -log(a)+a*log(b) - (a-1)*mean(log(dvec)) + mean(dvec^a)/b^a attr(obj,"gradient") = c(-1/a + log(b) - mean(log(dvec)) + mean(log(dvec/b)*dvec^a)/b^a, a/b- a*mean(dvec^a)/b^(a+1)) obj } > logLWeib0(c(1,1),xdat) [1] 0.3620798 > logLWeib1(c(1,1),xdat) [1] 0.3620798 attr(,"gradient") [1] -0.009937885 0.637920180 > (logLWeib0(c(1+1e-6,1),xdat)-logLWeib0(c(1-1e-6,1),xdat))/2e-6 [1] -0.009937885 > (logLWeib0(c(1,1+1e-6),xdat)-logLWeib0(c(1,1-1e-6),xdat))/2e-6 [1] 0.6379202 unix.time({ tmp = nlm(logLWeib1, c(1,1), dvec=xdat) }) user system elapsed 0.07 0.00 0.07 ### same result but longer computation ! unix.time({ tmp = nlm(logLWeib0, c(1.6,0.4), dvec=xdat) }) user system elapsed 0.06 0.00 0.06 ## Note longer time, more iterations ! > str(tmp) List of 5 $ minimum : num -0.145 $ estimate : num [1:2] 1.559 0.403 $ gradient : num [1:2] -4.17e-08 3.61e-07 $ code : int 1 $ iterations: int 10 logLWeib2 = function (par2,dvec) { n = length(dvec) a = par2[1] b = par2[2] obj = -log(a)+a*log(b) - (a-1)*mean(log(dvec)) + mean(dvec^a)/b^a attr(obj,"gradient") = c(-1/a + log(b) - mean(log(dvec)) + mean(log(dvec/b)*dvec^a)/b^a, a/b- a*mean(dvec^a)/b^(a+1)) attr(obj,"hessian") = array(c(1/a^2 + mean(log(dvec/b)^2*dvec^a)/b^a, rep(1/b- mean(dvec^a)/b^(a+1)-(a/b^(a+1))*mean(log(dvec/b)* dvec^a),2), -a/b^2 + (a*(a+1)/b^(a+2))*mean(dvec^a)), c(2,2)) obj } > logLWeib2(c(1.6,0.4),xdat) [1] -0.1445586 attr(,"gradient") [1] 0.03327704 -0.09032663 attr(,"hessian") [,1] [,2] [1,] 0.732797 -1.228535 [2,] -1.228535 16.587123 > (attr(logLWeib1(c(1.6+1e-6,.4),xdat),"gradient")- + attr(logLWeib1(c(1.6-1e-6,.4),xdat),"gradient"))/2e-6 [1] 0.732797 -1.228535 > (attr(logLWeib1(c(1.6,.4+1e-6),xdat),"gradient")- + attr(logLWeib1(c(1.6,.4-1e-6),xdat),"gradient"))/2e-6 [1] -1.228535 16.587123 > logLWeib2(c(1,1),xdat) [1] 0.3620798 attr(,"gradient") [1] -0.009937885 0.637920180 attr(,"hessian") [,1] [,2] [1,] 1.357504 1.2914855 [2,] 1.291486 -0.2758404 ### Note not pos-def at (1,1) ! unix.time({ tmp = nlm(logLWeib2, c(1,1), hessian=T, dvec=xdat) }) user system elapsed 0.29 0.00 0.29 > str(tmp) List of 6 $ minimum : num -0.145 $ estimate : num [1:2] 1.559 0.403 $ gradient : num [1:2] 1.88e-12 -1.75e-07 $ hessian : num [1:2, 1:2] 0.741 -1.041 -1.041 14.966 $ code : int 1 $ iterations: int 8 ## Note more accurate answer , gradient smaller attr(logLWeib2(c(tmp$est),xdat),"hessian") [,1] [,2] [1,] 0.740968 -1.041975 [2,] -1.041975 14.983215 tmp$hessian [,1] [,2] [1,] 0.7409032 -1.040887 [2,] -1.0408871 14.966268 ## NOTE not exactly the same ! ### One check on the validity of the model assumption ### is to compare the Fisher Info found here with the ### sum of gradients time their transposes: the coding ### is as follows: > WeibScor = function(par2,dvec) { n = length(dvec) a = par2[1] b = par2[2] scor = cbind(-1/a + log(b) - log(dvec) + log(dvec/b)*(dvec/b)^a, a/b- (a/b^(a+1))*dvec^a) list(scor=scor, gradllik = c(t(scor) %*% rep(1,n)), altInfo = t(scor) %*% scor/n) } > xdat = rweibull(2500,1.6,0.4) > logLWeib2(c(1.6,0.4),xdat) [1] -0.1710576 attr(,"gradient") [1] -0.004228303 -0.005230492 attr(,"hessian") [,1] [,2] [1,] 0.713670 -1.060161 [2,] -1.060161 16.033998 > unlist(WeibScor(c(1.6,0.4),xdat)[2:3]) gradllik1 gradllik2 altInfo1 altInfo2 altInfo3 altInfo4 -10.5707566 -13.0762305 0.7369663 -1.1290419 -1.1290419 16.0853598 ======================================== Listings for Do-it Yourself Optimization ======================================== > Gradmat function(parvec, infcn, eps = 1e-06) { # Function to calculate the difference-quotient approximate gradient # (matrix) of an arbitrary input (vector) function infcn # Now recoded to use central differences ! dd <- length(parvec) aa <- length(infcn(parvec)) epsmat <- (diag(dd) * eps)/2 gmat <- array(0, dim = c(aa, dd)) for(i in 1:dd) gmat[, i] <- (infcn(parvec + epsmat[, i]) - infcn(parvec - epsmat[, i]))/eps if(aa > 1) gmat else c(gmat) } NRroot function(inipar, infcn, nmax = 25, stoptol = 1e-05, eps = 1e-06, gradfunc = NULL) { ### This function implements a simple Newton-Raphson iteration ## to find the root x of the (vector) function infcn(x), ## but for the solution to be well-defined and for NRroot to ## work properly, the dimension q of the function values infcn(x) ## and of the parameter x should both be the same. In this case, ## if the gradient gradfunc is supplied, it should be the qxq ## Jacobian matrix of the function infcn with respect to its ## arguments. Function checked/updated as of 3/31/03 assign("Infcn", infcn, frame = 0) assign("Eps", eps, frame = 0) if(is.null(gradfunc)) gradfunc <- function(x) Gradmat(x, Infcn, Eps) ctr <- 0 newpar <- inipar oldpar <- inipar - 1 while(ctr < nmax & sqrt(sum((newpar - oldpar)^2)) > stoptol) { oldpar <- newpar newpar <- oldpar - solve(gradfunc(oldpar), infcn(oldpar)) ctr <- ctr + 1 } list(nstep = ctr, initial = inipar, final = newpar, funcval = infcn(newpar)) } > GradSrch function(inipar, infcn, step, nmax = 25, stoptol = 1e-05, unitfac = F, eps = 1e-06, gradfunc = NULL) { # Function to implement Steepest-descent. The unitfac condition # indicates whether or not the supplied step-length factor(s) multiply # the negative gradient itself, or the unit vector in the same direction. assign("Infcn", infcn, frame = 0) assign("Eps", eps, frame = 0) if(is.null(gradfunc)) gradfunc <- function(x) Gradmat(x, Infcn, Eps) steps <- if(length(step) > 1) step else rep(step, nmax) newpar <- inipar oldpar <- newpar - 1 ctr <- 0 while(ctr < nmax & sqrt(sum((newpar - oldpar)^2)) > stoptol) { ctr <- ctr + 1 oldpar <- newpar newstep <- gradfunc(oldpar) newstep <- if(unitfac) newstep/sum(newstep^2) else newstep newpar <- oldpar - steps[ctr] * newstep } list(nstep = ctr, initial = inipar, final = newpar, funcval = infcn(newpar)) }