From c11c6b2301170574c1bcf23208df37931e48bbab Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Sat, 26 Oct 2024 06:34:32 -0500 Subject: [PATCH 1/2] Conv. criteria: add per-component g_abstol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows `options.g_abstol` to be either a scalar (the traditional choice) or a vector. The motivation for this change comes from the fact that the gradient is dependent on how each variable is scaled: f₁(x, y) = x^2 + y^2 + 1 and f₂(x, y) = x^2 + 10^8 * y^2 + 1 Both have the same minimizer and minimum value, but at any point other than the x-axis, derivatives with respect to `y` of `f₂` are much larger than those of `f₁`. When roundoff error becomes a factor, the convergence requirement should be adjusted accordingly. --- docs/src/user/config.md | 2 +- src/multivariate/optimize/optimize.jl | 8 +++-- .../solvers/constrained/fminbox.jl | 3 +- .../solvers/constrained/ipnewton/interior.jl | 10 +++--- src/types.jl | 14 +++++---- src/utilities/assess_convergence.jl | 31 ++++++++++++++++--- src/utilities/generic.jl | 3 ++ test/general/api.jl | 18 +++++++++++ test/general/convergence.jl | 16 +++++++++- 9 files changed, 85 insertions(+), 20 deletions(-) diff --git a/docs/src/user/config.md b/docs/src/user/config.md index a67271e83..5126adaf8 100644 --- a/docs/src/user/config.md +++ b/docs/src/user/config.md @@ -40,7 +40,7 @@ In addition to the solver, you can alter the behavior of the Optim package by us * `x_tol`: Absolute tolerance in changes of the input vector `x`, in infinity norm. Defaults to `0.0`. * `f_tol`: Relative tolerance in changes of the objective value. Defaults to `0.0`. -* `g_tol`: Absolute tolerance in the gradient, in infinity norm. Defaults to `1e-8`. For gradient free methods, this will control the main convergence tolerance, which is solver specific. +* `g_tol`: Absolute tolerance in the gradient. If `g_tol` is a scalar (the default), convergence is achieved when `norm(g, Inf) ≤ g_tol`; if `g_tol` is supplied as a vector, then each component must satisfy `abs(g[i]) ≤ g_tol[i]`. Defaults to `1e-8`. For gradient-free methods (e.g., Nelder-Meade), this gets re-purposed to control the main convergence tolerance in a solver-specific manner. * `f_calls_limit`: A soft upper limit on the number of objective calls. Defaults to `0` (unlimited). * `g_calls_limit`: A soft upper limit on the number of gradient calls. Defaults to `0` (unlimited). * `h_calls_limit`: A soft upper limit on the number of Hessian calls. Defaults to `0` (unlimited). diff --git a/src/multivariate/optimize/optimize.jl b/src/multivariate/optimize/optimize.jl index 5c17faf71..c7e2ebe0c 100644 --- a/src/multivariate/optimize/optimize.jl +++ b/src/multivariate/optimize/optimize.jl @@ -24,7 +24,10 @@ after_while!(d, state, method, options) = nothing function initial_convergence(d, state, method::AbstractOptimizer, initial_x, options) gradient!(d, initial_x) stopped = !isfinite(value(d)) || any(!isfinite, gradient(d)) - maximum(abs, gradient(d)) <= options.g_abstol, stopped + g_abstol = options.g_abstol + conv = isa(g_abstol, Real) ? 
maximum(abs, gradient(d)) <= options.g_abstol : # scalar tolerance + all(t -> ((g, tol) = t; abs(g) <= tol), zip(gradient(d), g_abstol)) # per-component tolerance + return conv, stopped end function initial_convergence(d, state, method::ZerothOrderOptimizer, initial_x, options) false, false @@ -133,8 +136,7 @@ function optimize(d::D, initial_x::Tx, method::M, f_abschange(d, state), f_relchange(d, state), g_converged, - Tf(options.g_abstol), - g_residual(d, state), + g_converge_component(options.g_abstol, d, state)..., f_increased, tr, f_calls(d), diff --git a/src/multivariate/solvers/constrained/fminbox.jl b/src/multivariate/solvers/constrained/fminbox.jl index a91f3efff..5258a01ba 100644 --- a/src/multivariate/solvers/constrained/fminbox.jl +++ b/src/multivariate/solvers/constrained/fminbox.jl @@ -437,11 +437,12 @@ function optimize( callback=stopped_by_callback, f_increased=f_increased && !options.allow_f_increases) + g_abstol, gabs = g_converge_component(options.g_abstol, df, state) return MultivariateOptimizationResults(F, initial_x, minimizer(results), df.f(minimizer(results)), iteration, results.iteration_converged, results.x_converged, results.x_abstol, results.x_reltol, norm(x - xold), norm(x - xold)/norm(x), results.f_converged, results.f_abstol, results.f_reltol, f_abschange(minimum(results), value(dfbox)), f_relchange(minimum(results), value(dfbox)), - results.g_converged, results.g_abstol, norm(g, Inf), + results.g_converged, T(g_abstol), gabs, results.f_increased, results.trace, results.f_calls, results.g_calls, results.h_calls, nothing, options.time_limit, diff --git a/src/multivariate/solvers/constrained/ipnewton/interior.jl b/src/multivariate/solvers/constrained/ipnewton/interior.jl index 3406f3c2f..b49df7f70 100644 --- a/src/multivariate/solvers/constrained/ipnewton/interior.jl +++ b/src/multivariate/solvers/constrained/ipnewton/interior.jl @@ -182,7 +182,10 @@ function initial_convergence(d, state, method::ConstrainedOptimizer, initial_x, # state.bgrad normally comes from constraints.c!(..., initial_x) in initial_state gradient!(d, initial_x) stopped = !isfinite(value(d)) || any(!isfinite, gradient(d)) - norm(gradient(d), Inf) + norm(state.bgrad, Inf) < options.g_abstol, stopped + g_abstol = options.g_abstol + conv = isa(g_abstol, Real) ? norm(gradient(d), Inf) + norm(state.bgrad, Inf) <= options.g_abstol : # scalar tolerance + all((g, bg, tol) -> abs(g) + abs(bg) <= tol, zip(gradient(d), state.bgrad, g_abstol)) # per-component tolerance + return conv, stopped end function optimize(f, g, lower::AbstractArray, upper::AbstractArray, initial_x::AbstractArray, method::ConstrainedOptimizer=IPNewton(), @@ -203,7 +206,7 @@ end function optimize(d, lower::AbstractArray, upper::AbstractArray, initial_x::AbstractArray, method::ConstrainedOptimizer, options::Options = Options(;default_options(method)...)) twicediffed = d isa TwiceDifferentiable ? 
d : TwiceDifferentiable(d, initial_x) - + bounds = ConstraintBounds(lower, upper, [], []) constraints = TwiceDifferentiableConstraints( (c,x)->nothing, (J,x)->nothing, (H,x,λ)->nothing, bounds) @@ -311,8 +314,7 @@ function optimize(d::AbstractObjective, constraints::AbstractConstraints, initia f_abschange(d, state), f_relchange(d, state), g_converged, - T(options.g_abstol), - g_residual(d), + g_converge_component(options.g_abstol, d, state)..., f_increased, tr, f_calls(d), diff --git a/src/types.jl b/src/types.jl index bf33241b7..98bcfab0e 100644 --- a/src/types.jl +++ b/src/types.jl @@ -15,7 +15,7 @@ x_abstol::Real = 0.0, x_reltol::Real = 0.0, f_abstol::Real = 0.0, f_reltol::Real = 0.0, -g_abstol::Real = 1e-8, +g_abstol::Real = 1e-8, # alternatively, supply per-component vector of tolerances g_reltol::Real = 1e-8, outer_x_abstol::Real = 0.0, outer_x_reltol::Real = 0.0, @@ -39,14 +39,14 @@ show_every::Int = 1, callback = nothing, time_limit = NaN ``` -See http://julianlsolvers.github.io/Optim.jl/stable/#user/config/ +See http://julianlsolvers.github.io/Optim.jl/stable/user/config/ """ struct Options{T, TCallback} x_abstol::T x_reltol::T f_abstol::T f_reltol::T - g_abstol::T + g_abstol::Union{T,Vector{T}} g_reltol::T outer_x_abstol::T outer_x_reltol::T @@ -80,7 +80,7 @@ function Options(; x_reltol::Real = 0.0, f_abstol::Real = 0.0, f_reltol::Real = 0.0, - g_abstol::Real = 1e-8, + g_abstol::Union{Real,AbstractVector{<:Real}} = 1e-8, g_reltol::Real = 1e-8, outer_x_tol = nothing, outer_f_tol = nothing, @@ -129,7 +129,9 @@ function Options(; if !(outer_f_tol === nothing) outer_f_reltol = outer_f_tol end - Options(promote(x_abstol, x_reltol, f_abstol, f_reltol, g_abstol, g_reltol, outer_x_abstol, outer_x_reltol, outer_f_abstol, outer_f_reltol, outer_g_abstol, outer_g_reltol)..., f_calls_limit, g_calls_limit, h_calls_limit, + scalars = promote(f_abstol, f_reltol, outer_f_abstol, outer_f_reltol, x_reltol, g_reltol, outer_x_reltol, outer_g_reltol) + T = promote_type(eltype(scalars), eltype(x_abstol), eltype(g_abstol), eltype(outer_x_abstol), eltype(outer_g_abstol)) + Options{T, typeof(callback)}(x_abstol, x_reltol, f_abstol, f_reltol, to_eltype(T, g_abstol), g_reltol, outer_x_abstol, outer_x_reltol, outer_f_abstol, outer_f_reltol, outer_g_abstol, outer_g_reltol, f_calls_limit, g_calls_limit, h_calls_limit, allow_f_increases, allow_outer_f_increases, successive_f_tol, Int(iterations), Int(outer_iterations), store_trace, trace_simplex, show_trace, extended_trace, show_warnings, Int(show_every), callback, Float64(time_limit)) end @@ -196,7 +198,7 @@ mutable struct MultivariateOptimizationResults{O, Tx, Tc, Tf, M, Tls, Tsb} <: Op f_abschange::Tc f_relchange::Tc g_converged::Bool - g_abstol::Tf + g_abstol::Tf # while this might be a vector, we store the component with largest violation g_residual::Tc f_increased::Bool trace::M diff --git a/src/utilities/assess_convergence.jl b/src/utilities/assess_convergence.jl index f6b8d7ce6..1609eea13 100644 --- a/src/utilities/assess_convergence.jl +++ b/src/utilities/assess_convergence.jl @@ -13,7 +13,27 @@ g_residual(d, state::NelderMeadState) = state.nm_x g_residual(d::AbstractObjective) = g_residual(gradient(d)) g_residual(d::NonDifferentiable) = convert(typeof(value(d)), NaN) g_residual(g) = maximum(abs, g) -gradient_convergence_assessment(state::AbstractOptimizerState, d, options) = g_residual(gradient(d)) ≤ options.g_abstol + +function g_converge_component(g_tol::AbstractVector, g) + absratio(num, denom) = (iszero(num) && iszero(denom)) ? 
zero(num/denom) : abs(num)/denom + + imax, maxratio = firstindex(g) - 1, typemin(first(g) / first(g_tol)) + for i in eachindex(g_tol, g) + ratio = absratio(g[i], g_tol[i]) + if ratio > maxratio + imax, maxratio = i, ratio + end + end + return g_tol[imax], abs(g[imax]) +end +g_converge_component(g_tol, g) = g_tol, g_residual(g) +g_converge_component(g_tol::AbstractVector, d, state) = g_converge_component(g_tol, gradient(d)) +g_converge_component(g_tol, d, state) = g_tol, g_residual(d, state) + +function gradient_convergence_assessment(state::AbstractOptimizerState, d, options) + g_tol, gnorm = g_converge_component(options.g_abstol, d, state) + return gnorm ≤ g_tol +end gradient_convergence_assessment(state::ZerothOrderState, d, options) = false # Default function for convergence assessment used by @@ -56,7 +76,8 @@ function assess_convergence(x, x_previous, f_x, f_x_previous, gx, x_abstol, x_re f_increased = true end - g_converged = g_residual(gx) ≤ g_abstol + g_converged = isa(g_abstol, Real) ? g_residual(gx) ≤ g_abstol : + all(t -> abs(t[1]) ≤ t[2], zip(gx, g_abstol)) return x_converged, f_converged, g_converged, f_increased end @@ -88,8 +109,10 @@ function assess_convergence(x, f_increased = true end - if g_residual(g) ≤ g_tol - g_converged = true + if isa(g_tol, Real) + g_converged = g_residual(g) ≤ g_tol + else + g_converged = all(t -> abs(t[1]) ≤ t[2], zip(g, g_tol)) end return x_converged, f_converged, g_converged, f_increased diff --git a/src/utilities/generic.jl b/src/utilities/generic.jl index fc677c524..b4fbd6287 100644 --- a/src/utilities/generic.jl +++ b/src/utilities/generic.jl @@ -15,3 +15,6 @@ end (similar(initial_x), # Buffer of x for line search in state.x_ls real(one(T))) # Keep track of step size in state.alpha end + +to_eltype(::Type{T}, x::Real) where T = T(x) +to_eltype(::Type{T}, x::AbstractArray) where T = convert(AbstractVector{T}, x) diff --git a/test/general/api.jl b/test/general/api.jl index 5fb82cac4..251bea5cc 100644 --- a/test/general/api.jl +++ b/test/general/api.jl @@ -102,6 +102,24 @@ initial_x, Newton(), options_g) + + options_gvec = Optim.Options(g_abstol = [1e-4, 1e-8], g_reltol = 0) + + for method in (BFGS(), LBFGS(), Newton()) + res = optimize(f, g!, h!, + initial_x, + method, + options_gvec) + @test Optim.g_converged(res) + @test all(abs.(Optim.gradient(d2, res.minimizer)) .≤ options_gvec.g_abstol) + str = sprint(show, res) + @test occursin(r"|g(x)|.*≤ 1.0e-04", str) || occursin(r"|g(x)|.*≤ 1.0e-08", str) + @test occursin(r"|x - x'| .*≰", str) + @test occursin(r"|x - x'|/|x'|.*≰", str) + @test occursin(r"|f(x) - f(x')| .*≰", str) + @test occursin(r"|f(x) - f(x')|/|f(x')|.*≰", str) + end + options_sa = Optim.Options(iterations = 10, store_trace = true, show_trace = false) res = optimize(f, g!, h!, diff --git a/test/general/convergence.jl b/test/general/convergence.jl index 5d0e5b20c..2ee1f2b25 100644 --- a/test/general/convergence.jl +++ b/test/general/convergence.jl @@ -37,6 +37,9 @@ mutable struct DummyMethodZeroth <: Optim.ZerothOrderOptimizer end g_tol = 1e-6 g_abstol = 1e-6 @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, true, true, false) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, [g_tol]) == (true, true, true, false) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol/1000) == (true, true, false, false) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, [g_tol/1000]) == (true, true, false, false) # f_increase f0, f1 = 1.0, 1.0 + 1e-7 @test 
Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, true, true, true) @@ -46,8 +49,19 @@ mutable struct DummyMethodZeroth <: Optim.ZerothOrderOptimizer end @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, false, true, true) + # Implementation with both abstol and reltol + f_tol = 1e-6 + @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol, 0, f_tol, g_tol) == (true, true, true, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol, 0, f_tol, [g_tol]) == (true, true, true, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, 0, f_tol, 0, g_tol) == (true, true, true, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, 0, f_tol, 0, [g_tol]) == (true, true, true, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol/1000, 0, f_tol/1000, g_tol/1000) == (false, false, false, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol/1000, 0, f_tol/1000, [g_tol/1000]) == (false, false, false, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol/1000, 0, f_tol/1000, 0, g_tol/1000) == (false, false, false, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol/1000, 0, f_tol/1000, 0, [g_tol/1000]) == (false, false, false, true) + f_tol = 1e-6 # rel tol - dOpt = DummyOptions(x_tol, f_tol, g_tol, g_abstol) + dOpt = DummyOptions(x_tol, f_tol, g_tol, g_abstol) # FIXME: this isn't used? @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, true, true, true) f0, f1 = 1.0, 1.0 - 1e-7 From 47f22774b4100c822a3a2c8e067793139b423a97 Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Sun, 27 Oct 2024 13:41:51 -0500 Subject: [PATCH 2/2] Expand documentation on convergence options Fixes #1102 --- docs/src/user/config.md | 11 +++++++---- docs/src/user/minimization.md | 14 +++++++++++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/docs/src/user/config.md b/docs/src/user/config.md index 5126adaf8..9d67fa747 100644 --- a/docs/src/user/config.md +++ b/docs/src/user/config.md @@ -35,12 +35,15 @@ Special methods for bounded univariate optimization: * `Brent()` * `GoldenSection()` -### General Options +### [General Options](@id config-general) + In addition to the solver, you can alter the behavior of the Optim package by using the following keywords: -* `x_tol`: Absolute tolerance in changes of the input vector `x`, in infinity norm. Defaults to `0.0`. -* `f_tol`: Relative tolerance in changes of the objective value. Defaults to `0.0`. -* `g_tol`: Absolute tolerance in the gradient. If `g_tol` is a scalar (the default), convergence is achieved when `norm(g, Inf) ≤ g_tol`; if `g_tol` is supplied as a vector, then each component must satisfy `abs(g[i]) ≤ g_tol[i]`. Defaults to `1e-8`. For gradient-free methods (e.g., Nelder-Meade), this gets re-purposed to control the main convergence tolerance in a solver-specific manner. +* `x_tol` (alternatively, `x_abstol`): Absolute tolerance in changes of the input vector `x`, in infinity norm. Concretely, if `|x-x'| ≤ x_tol` on successive evaluation points `x` and `x'`, convergence is achieved. Defaults to `0.0`. +* `x_reltol`: Relative tolerance in changes of the input vector `x`, in infinity norm. Concretely, if `|x-x'| ≤ x_reltol * |x|`, convergence is achieved. Defaults to `0.0` +* `f_tol` (alternatively, `f_reltol`): Relative tolerance in changes of the objective value. Defaults to `0.0`. +* `f_abstol`: Absolute tolerance in changes of the objective value. Defaults to `0.0`. 
+* `g_tol` (alternatively, `g_abstol`): Absolute tolerance in the gradient. If `g_tol` is a scalar (the default), convergence is achieved when `norm(g, Inf) ≤ g_tol`; if `g_tol` is supplied as a vector, then each component must satisfy `abs(g[i]) ≤ g_tol[i]`. Defaults to `1e-8`. For gradient-free methods (e.g., Nelder-Mead), this gets re-purposed to control the main convergence tolerance in a solver-specific manner.
 * `f_calls_limit`: A soft upper limit on the number of objective calls. Defaults to `0` (unlimited).
 * `g_calls_limit`: A soft upper limit on the number of gradient calls. Defaults to `0` (unlimited).
 * `h_calls_limit`: A soft upper limit on the number of Hessian calls. Defaults to `0` (unlimited).
diff --git a/docs/src/user/minimization.md b/docs/src/user/minimization.md
index 238ad591e..1a55a5886 100644
--- a/docs/src/user/minimization.md
+++ b/docs/src/user/minimization.md
@@ -31,6 +31,12 @@ For better performance and greater precision, you can pass your own gradient fun
 optimize(f, x0, LBFGS(); autodiff = :forward)
 ```

+!!! note
+    For most real-world problems, you may want to carefully consider the appropriate convergence criteria.
+    By default, algorithms that support gradients converge if `norm(g, Inf) ≤ 1e-8`. Depending on how your variables are scaled,
+    this may or may not be appropriate. See [configuration](@ref config-general) for more information about your options.
+    Examining traces (`Options(show_trace=true)`) during optimization may provide insight about when convergence is achieved in practice.
+
 For the Rosenbrock example, the analytical gradient can be shown to be:
 ```jl
 function g!(G, x)
@@ -65,7 +71,7 @@ Now we can use Newton's method for optimization by running:
 ```jl
 optimize(f, g!, h!, x0)
 ```
-Which defaults to `Newton()` since a Hessian function was provided. Like gradients, the Hessian function will be ignored if you use a method that does not require it:
+which defaults to `Newton()` since a Hessian function was provided. Like gradients, the Hessian function will be ignored if you use a method that does not require it:
 ```jl
 optimize(f, g!, h!, x0, LBFGS())
 ```
@@ -74,6 +80,12 @@ because of the potentially low accuracy of approximations to the Hessians.
 Other than Newton's method, none of the algorithms provided by the Optim package
 employ exact Hessians.

+As a reminder, it's advised to set your convergence criteria manually based on
+your knowledge of the problem:
+```jl
+optimize(f, g!, h!, x0, Optim.Options(g_tol = 1e-12))
+```
+
 ## Box Constrained Optimization

 A primal interior-point algorithm for simple "box" constraints (lower and upper bounds) is available. Reusing our Rosenbrock example from above, boxed minimization is performed as follows:
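For reviewers who want to try the new option, here is a minimal usage sketch of the per-component tolerance this patch introduces. The objective mirrors `f₂` from the commit message; the names `f2`, `g2!`, `opts`, `res` and the particular tolerance values are illustrative choices rather than part of the patch.

```jl
using Optim

# f₂ from the commit message: the gradient with respect to y is 10^8 times
# more sensitive to displacement than the gradient with respect to x
f2(v)     = v[1]^2 + 1e8 * v[2]^2 + 1
g2!(G, v) = (G[1] = 2 * v[1]; G[2] = 2e8 * v[2]; G)

# Loosen the gradient tolerance only for the badly scaled second component;
# g_reltol is set to 0 to mirror the new test in test/general/api.jl.
opts = Optim.Options(g_abstol = [1e-8, 1e-2], g_reltol = 0)
res  = optimize(f2, g2!, [1.0, 1.0], LBFGS(), opts)

Optim.g_converged(res)  # true once abs(g[i]) ≤ g_abstol[i] holds for every component
```

A scalar `g_abstol` keeps the existing behavior, so calls such as `Optim.Options(g_abstol = 1e-8)` are unchanged.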