From c11c6b2301170574c1bcf23208df37931e48bbab Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Sat, 26 Oct 2024 06:34:32 -0500 Subject: [PATCH 1/2] Conv. criteria: add per-component g_abstol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows `options.g_abstol` to be either a scalar (the traditional choice) or a vector. The motivation for this change comes from the fact that the gradient is dependent on how each variable is scaled: f₁(x, y) = x^2 + y^2 + 1 and f₂(x, y) = x^2 + 10^8 * y^2 + 1 Both have the same minimizer and minimum value, but at any point other than the x-axis, derivatives with respect to `y` of `f₂` are much larger than those of `f₁`. When roundoff error becomes a factor, the convergence requirement should be adjusted accordingly. --- docs/src/user/config.md | 2 +- src/multivariate/optimize/optimize.jl | 8 +++-- .../solvers/constrained/fminbox.jl | 3 +- .../solvers/constrained/ipnewton/interior.jl | 10 +++--- src/types.jl | 14 +++++---- src/utilities/assess_convergence.jl | 31 ++++++++++++++++--- src/utilities/generic.jl | 3 ++ test/general/api.jl | 18 +++++++++++ test/general/convergence.jl | 16 +++++++++- 9 files changed, 85 insertions(+), 20 deletions(-) diff --git a/docs/src/user/config.md b/docs/src/user/config.md index a67271e83..5126adaf8 100644 --- a/docs/src/user/config.md +++ b/docs/src/user/config.md @@ -40,7 +40,7 @@ In addition to the solver, you can alter the behavior of the Optim package by us * `x_tol`: Absolute tolerance in changes of the input vector `x`, in infinity norm. Defaults to `0.0`. * `f_tol`: Relative tolerance in changes of the objective value. Defaults to `0.0`. -* `g_tol`: Absolute tolerance in the gradient, in infinity norm. Defaults to `1e-8`. For gradient free methods, this will control the main convergence tolerance, which is solver specific. +* `g_tol`: Absolute tolerance in the gradient. If `g_tol` is a scalar (the default), convergence is achieved when `norm(g, Inf) ≤ g_tol`; if `g_tol` is supplied as a vector, then each component must satisfy `abs(g[i]) ≤ g_tol[i]`. Defaults to `1e-8`. For gradient-free methods (e.g., Nelder-Meade), this gets re-purposed to control the main convergence tolerance in a solver-specific manner. * `f_calls_limit`: A soft upper limit on the number of objective calls. Defaults to `0` (unlimited). * `g_calls_limit`: A soft upper limit on the number of gradient calls. Defaults to `0` (unlimited). * `h_calls_limit`: A soft upper limit on the number of Hessian calls. Defaults to `0` (unlimited). diff --git a/src/multivariate/optimize/optimize.jl b/src/multivariate/optimize/optimize.jl index 5c17faf71..c7e2ebe0c 100644 --- a/src/multivariate/optimize/optimize.jl +++ b/src/multivariate/optimize/optimize.jl @@ -24,7 +24,10 @@ after_while!(d, state, method, options) = nothing function initial_convergence(d, state, method::AbstractOptimizer, initial_x, options) gradient!(d, initial_x) stopped = !isfinite(value(d)) || any(!isfinite, gradient(d)) - maximum(abs, gradient(d)) <= options.g_abstol, stopped + g_abstol = options.g_abstol + conv = isa(g_abstol, Real) ? 
maximum(abs, gradient(d)) <= options.g_abstol : # scalar tolerance + all(t -> ((g, tol) = t; abs(g) <= tol), zip(gradient(d), g_abstol)) # per-component tolerance + return conv, stopped end function initial_convergence(d, state, method::ZerothOrderOptimizer, initial_x, options) false, false @@ -133,8 +136,7 @@ function optimize(d::D, initial_x::Tx, method::M, f_abschange(d, state), f_relchange(d, state), g_converged, - Tf(options.g_abstol), - g_residual(d, state), + g_converge_component(options.g_abstol, d, state)..., f_increased, tr, f_calls(d), diff --git a/src/multivariate/solvers/constrained/fminbox.jl b/src/multivariate/solvers/constrained/fminbox.jl index a91f3efff..5258a01ba 100644 --- a/src/multivariate/solvers/constrained/fminbox.jl +++ b/src/multivariate/solvers/constrained/fminbox.jl @@ -437,11 +437,12 @@ function optimize( callback=stopped_by_callback, f_increased=f_increased && !options.allow_f_increases) + g_abstol, gabs = g_converge_component(options.g_abstol, df, state) return MultivariateOptimizationResults(F, initial_x, minimizer(results), df.f(minimizer(results)), iteration, results.iteration_converged, results.x_converged, results.x_abstol, results.x_reltol, norm(x - xold), norm(x - xold)/norm(x), results.f_converged, results.f_abstol, results.f_reltol, f_abschange(minimum(results), value(dfbox)), f_relchange(minimum(results), value(dfbox)), - results.g_converged, results.g_abstol, norm(g, Inf), + results.g_converged, T(g_abstol), gabs, results.f_increased, results.trace, results.f_calls, results.g_calls, results.h_calls, nothing, options.time_limit, diff --git a/src/multivariate/solvers/constrained/ipnewton/interior.jl b/src/multivariate/solvers/constrained/ipnewton/interior.jl index 3406f3c2f..b49df7f70 100644 --- a/src/multivariate/solvers/constrained/ipnewton/interior.jl +++ b/src/multivariate/solvers/constrained/ipnewton/interior.jl @@ -182,7 +182,10 @@ function initial_convergence(d, state, method::ConstrainedOptimizer, initial_x, # state.bgrad normally comes from constraints.c!(..., initial_x) in initial_state gradient!(d, initial_x) stopped = !isfinite(value(d)) || any(!isfinite, gradient(d)) - norm(gradient(d), Inf) + norm(state.bgrad, Inf) < options.g_abstol, stopped + g_abstol = options.g_abstol + conv = isa(g_abstol, Real) ? norm(gradient(d), Inf) + norm(state.bgrad, Inf) <= options.g_abstol : # scalar tolerance + all((g, bg, tol) -> abs(g) + abs(bg) <= tol, zip(gradient(d), state.bgrad, g_abstol)) # per-component tolerance + return conv, stopped end function optimize(f, g, lower::AbstractArray, upper::AbstractArray, initial_x::AbstractArray, method::ConstrainedOptimizer=IPNewton(), @@ -203,7 +206,7 @@ end function optimize(d, lower::AbstractArray, upper::AbstractArray, initial_x::AbstractArray, method::ConstrainedOptimizer, options::Options = Options(;default_options(method)...)) twicediffed = d isa TwiceDifferentiable ? 
d : TwiceDifferentiable(d, initial_x) - + bounds = ConstraintBounds(lower, upper, [], []) constraints = TwiceDifferentiableConstraints( (c,x)->nothing, (J,x)->nothing, (H,x,λ)->nothing, bounds) @@ -311,8 +314,7 @@ function optimize(d::AbstractObjective, constraints::AbstractConstraints, initia f_abschange(d, state), f_relchange(d, state), g_converged, - T(options.g_abstol), - g_residual(d), + g_converge_component(options.g_abstol, d, state)..., f_increased, tr, f_calls(d), diff --git a/src/types.jl b/src/types.jl index bf33241b7..98bcfab0e 100644 --- a/src/types.jl +++ b/src/types.jl @@ -15,7 +15,7 @@ x_abstol::Real = 0.0, x_reltol::Real = 0.0, f_abstol::Real = 0.0, f_reltol::Real = 0.0, -g_abstol::Real = 1e-8, +g_abstol::Real = 1e-8, # alternatively, supply per-component vector of tolerances g_reltol::Real = 1e-8, outer_x_abstol::Real = 0.0, outer_x_reltol::Real = 0.0, @@ -39,14 +39,14 @@ show_every::Int = 1, callback = nothing, time_limit = NaN ``` -See http://julianlsolvers.github.io/Optim.jl/stable/#user/config/ +See http://julianlsolvers.github.io/Optim.jl/stable/user/config/ """ struct Options{T, TCallback} x_abstol::T x_reltol::T f_abstol::T f_reltol::T - g_abstol::T + g_abstol::Union{T,Vector{T}} g_reltol::T outer_x_abstol::T outer_x_reltol::T @@ -80,7 +80,7 @@ function Options(; x_reltol::Real = 0.0, f_abstol::Real = 0.0, f_reltol::Real = 0.0, - g_abstol::Real = 1e-8, + g_abstol::Union{Real,AbstractVector{<:Real}} = 1e-8, g_reltol::Real = 1e-8, outer_x_tol = nothing, outer_f_tol = nothing, @@ -129,7 +129,9 @@ function Options(; if !(outer_f_tol === nothing) outer_f_reltol = outer_f_tol end - Options(promote(x_abstol, x_reltol, f_abstol, f_reltol, g_abstol, g_reltol, outer_x_abstol, outer_x_reltol, outer_f_abstol, outer_f_reltol, outer_g_abstol, outer_g_reltol)..., f_calls_limit, g_calls_limit, h_calls_limit, + scalars = promote(f_abstol, f_reltol, outer_f_abstol, outer_f_reltol, x_reltol, g_reltol, outer_x_reltol, outer_g_reltol) + T = promote_type(eltype(scalars), eltype(x_abstol), eltype(g_abstol), eltype(outer_x_abstol), eltype(outer_g_abstol)) + Options{T, typeof(callback)}(x_abstol, x_reltol, f_abstol, f_reltol, to_eltype(T, g_abstol), g_reltol, outer_x_abstol, outer_x_reltol, outer_f_abstol, outer_f_reltol, outer_g_abstol, outer_g_reltol, f_calls_limit, g_calls_limit, h_calls_limit, allow_f_increases, allow_outer_f_increases, successive_f_tol, Int(iterations), Int(outer_iterations), store_trace, trace_simplex, show_trace, extended_trace, show_warnings, Int(show_every), callback, Float64(time_limit)) end @@ -196,7 +198,7 @@ mutable struct MultivariateOptimizationResults{O, Tx, Tc, Tf, M, Tls, Tsb} <: Op f_abschange::Tc f_relchange::Tc g_converged::Bool - g_abstol::Tf + g_abstol::Tf # while this might be a vector, we store the component with largest violation g_residual::Tc f_increased::Bool trace::M diff --git a/src/utilities/assess_convergence.jl b/src/utilities/assess_convergence.jl index f6b8d7ce6..1609eea13 100644 --- a/src/utilities/assess_convergence.jl +++ b/src/utilities/assess_convergence.jl @@ -13,7 +13,27 @@ g_residual(d, state::NelderMeadState) = state.nm_x g_residual(d::AbstractObjective) = g_residual(gradient(d)) g_residual(d::NonDifferentiable) = convert(typeof(value(d)), NaN) g_residual(g) = maximum(abs, g) -gradient_convergence_assessment(state::AbstractOptimizerState, d, options) = g_residual(gradient(d)) ≤ options.g_abstol + +function g_converge_component(g_tol::AbstractVector, g) + absratio(num, denom) = (iszero(num) && iszero(denom)) ? 
zero(num/denom) : abs(num)/denom + + imax, maxratio = firstindex(g) - 1, typemin(first(g) / first(g_tol)) + for i in eachindex(g_tol, g) + ratio = absratio(g[i], g_tol[i]) + if ratio > maxratio + imax, maxratio = i, ratio + end + end + return g_tol[imax], abs(g[imax]) +end +g_converge_component(g_tol, g) = g_tol, g_residual(g) +g_converge_component(g_tol::AbstractVector, d, state) = g_converge_component(g_tol, gradient(d)) +g_converge_component(g_tol, d, state) = g_tol, g_residual(d, state) + +function gradient_convergence_assessment(state::AbstractOptimizerState, d, options) + g_tol, gnorm = g_converge_component(options.g_abstol, d, state) + return gnorm ≤ g_tol +end gradient_convergence_assessment(state::ZerothOrderState, d, options) = false # Default function for convergence assessment used by @@ -56,7 +76,8 @@ function assess_convergence(x, x_previous, f_x, f_x_previous, gx, x_abstol, x_re f_increased = true end - g_converged = g_residual(gx) ≤ g_abstol + g_converged = isa(g_abstol, Real) ? g_residual(gx) ≤ g_abstol : + all(t -> abs(t[1]) ≤ t[2], zip(gx, g_abstol)) return x_converged, f_converged, g_converged, f_increased end @@ -88,8 +109,10 @@ function assess_convergence(x, f_increased = true end - if g_residual(g) ≤ g_tol - g_converged = true + if isa(g_tol, Real) + g_converged = g_residual(g) ≤ g_tol + else + g_converged = all(t -> abs(t[1]) ≤ t[2], zip(g, g_tol)) end return x_converged, f_converged, g_converged, f_increased diff --git a/src/utilities/generic.jl b/src/utilities/generic.jl index fc677c524..b4fbd6287 100644 --- a/src/utilities/generic.jl +++ b/src/utilities/generic.jl @@ -15,3 +15,6 @@ end (similar(initial_x), # Buffer of x for line search in state.x_ls real(one(T))) # Keep track of step size in state.alpha end + +to_eltype(::Type{T}, x::Real) where T = T(x) +to_eltype(::Type{T}, x::AbstractArray) where T = convert(AbstractVector{T}, x) diff --git a/test/general/api.jl b/test/general/api.jl index 5fb82cac4..251bea5cc 100644 --- a/test/general/api.jl +++ b/test/general/api.jl @@ -102,6 +102,24 @@ initial_x, Newton(), options_g) + + options_gvec = Optim.Options(g_abstol = [1e-4, 1e-8], g_reltol = 0) + + for method in (BFGS(), LBFGS(), Newton()) + res = optimize(f, g!, h!, + initial_x, + method, + options_gvec) + @test Optim.g_converged(res) + @test all(abs.(Optim.gradient(d2, res.minimizer)) .≤ options_gvec.g_abstol) + str = sprint(show, res) + @test occursin(r"|g(x)|.*≤ 1.0e-04", str) || occursin(r"|g(x)|.*≤ 1.0e-08", str) + @test occursin(r"|x - x'| .*≰", str) + @test occursin(r"|x - x'|/|x'|.*≰", str) + @test occursin(r"|f(x) - f(x')| .*≰", str) + @test occursin(r"|f(x) - f(x')|/|f(x')|.*≰", str) + end + options_sa = Optim.Options(iterations = 10, store_trace = true, show_trace = false) res = optimize(f, g!, h!, diff --git a/test/general/convergence.jl b/test/general/convergence.jl index 5d0e5b20c..2ee1f2b25 100644 --- a/test/general/convergence.jl +++ b/test/general/convergence.jl @@ -37,6 +37,9 @@ mutable struct DummyMethodZeroth <: Optim.ZerothOrderOptimizer end g_tol = 1e-6 g_abstol = 1e-6 @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, true, true, false) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, [g_tol]) == (true, true, true, false) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol/1000) == (true, true, false, false) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, [g_tol/1000]) == (true, true, false, false) # f_increase f0, f1 = 1.0, 1.0 + 1e-7 @test 
Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, true, true, true) @@ -46,8 +49,19 @@ mutable struct DummyMethodZeroth <: Optim.ZerothOrderOptimizer end @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, false, true, true) + # Implementation with both abstol and reltol + f_tol = 1e-6 + @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol, 0, f_tol, g_tol) == (true, true, true, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol, 0, f_tol, [g_tol]) == (true, true, true, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, 0, f_tol, 0, g_tol) == (true, true, true, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, 0, f_tol, 0, [g_tol]) == (true, true, true, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol/1000, 0, f_tol/1000, g_tol/1000) == (false, false, false, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, 0, x_tol/1000, 0, f_tol/1000, [g_tol/1000]) == (false, false, false, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol/1000, 0, f_tol/1000, 0, g_tol/1000) == (false, false, false, true) + @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol/1000, 0, f_tol/1000, 0, [g_tol/1000]) == (false, false, false, true) + f_tol = 1e-6 # rel tol - dOpt = DummyOptions(x_tol, f_tol, g_tol, g_abstol) + dOpt = DummyOptions(x_tol, f_tol, g_tol, g_abstol) # FIXME: this isn't used? @test Optim.assess_convergence(x1, x0, f1, f0, g, x_tol, f_tol, g_tol) == (true, true, true, true) f0, f1 = 1.0, 1.0 - 1e-7 From 47f22774b4100c822a3a2c8e067793139b423a97 Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Sun, 27 Oct 2024 13:41:51 -0500 Subject: [PATCH 2/2] Expand documentation on convergence options Fixes #1102 --- docs/src/user/config.md | 11 +++++++---- docs/src/user/minimization.md | 14 +++++++++++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/docs/src/user/config.md b/docs/src/user/config.md index 5126adaf8..9d67fa747 100644 --- a/docs/src/user/config.md +++ b/docs/src/user/config.md @@ -35,12 +35,15 @@ Special methods for bounded univariate optimization: * `Brent()` * `GoldenSection()` -### General Options +### [General Options](@id config-general) + In addition to the solver, you can alter the behavior of the Optim package by using the following keywords: -* `x_tol`: Absolute tolerance in changes of the input vector `x`, in infinity norm. Defaults to `0.0`. -* `f_tol`: Relative tolerance in changes of the objective value. Defaults to `0.0`. -* `g_tol`: Absolute tolerance in the gradient. If `g_tol` is a scalar (the default), convergence is achieved when `norm(g, Inf) ≤ g_tol`; if `g_tol` is supplied as a vector, then each component must satisfy `abs(g[i]) ≤ g_tol[i]`. Defaults to `1e-8`. For gradient-free methods (e.g., Nelder-Meade), this gets re-purposed to control the main convergence tolerance in a solver-specific manner. +* `x_tol` (alternatively, `x_abstol`): Absolute tolerance in changes of the input vector `x`, in infinity norm. Concretely, if `|x-x'| ≤ x_tol` on successive evaluation points `x` and `x'`, convergence is achieved. Defaults to `0.0`. +* `x_reltol`: Relative tolerance in changes of the input vector `x`, in infinity norm. Concretely, if `|x-x'| ≤ x_reltol * |x|`, convergence is achieved. Defaults to `0.0` +* `f_tol` (alternatively, `f_reltol`): Relative tolerance in changes of the objective value. Defaults to `0.0`. +* `f_abstol`: Absolute tolerance in changes of the objective value. Defaults to `0.0`. 
+* `g_tol` (alternatively, `g_abstol`): Absolute tolerance in the gradient. If `g_tol` is a scalar (the default), convergence is achieved when `norm(g, Inf) ≤ g_tol`; if `g_tol` is supplied as a vector, then each component must satisfy `abs(g[i]) ≤ g_tol[i]`. Defaults to `1e-8`. For gradient-free methods (e.g., Nelder-Mead), this gets re-purposed to control the main convergence tolerance in a solver-specific manner.
 * `f_calls_limit`: A soft upper limit on the number of objective calls. Defaults to `0` (unlimited).
 * `g_calls_limit`: A soft upper limit on the number of gradient calls. Defaults to `0` (unlimited).
 * `h_calls_limit`: A soft upper limit on the number of Hessian calls. Defaults to `0` (unlimited).
diff --git a/docs/src/user/minimization.md b/docs/src/user/minimization.md
index 238ad591e..1a55a5886 100644
--- a/docs/src/user/minimization.md
+++ b/docs/src/user/minimization.md
@@ -31,6 +31,12 @@ For better performance and greater precision, you can pass your own gradient fun
 optimize(f, x0, LBFGS(); autodiff = :forward)
 ```

+!!! note
+    For most real-world problems, you may want to carefully consider the appropriate convergence criteria.
+    By default, algorithms that support gradients converge if `norm(g, Inf) ≤ 1e-8`. Depending on how your variables are scaled,
+    this may or may not be appropriate. See [configuration](@ref config-general) for more information about your options.
+    Examining traces (`Options(show_trace=true)`) during optimization may provide insight about when convergence is achieved in practice.
+
 For the Rosenbrock example, the analytical gradient can be shown to be:
 ```jl
 function g!(G, x)
@@ -65,7 +71,7 @@ Now we can use Newton's method for optimization by running:
 ```jl
 optimize(f, g!, h!, x0)
 ```
-Which defaults to `Newton()` since a Hessian function was provided. Like gradients, the Hessian function will be ignored if you use a method that does not require it:
+which defaults to `Newton()` since a Hessian function was provided. Like gradients, the Hessian function will be ignored if you use a method that does not require it:
 ```jl
 optimize(f, g!, h!, x0, LBFGS())
 ```
@@ -74,6 +80,12 @@ because of the potentially low accuracy of approximations to the Hessians.
 Other than Newton's method, none of the algorithms provided by the Optim package
 employ exact Hessians.

+As a reminder, it's advised to set your convergence criteria manually based on
+your knowledge of the problem:
+```jl
+optimize(f, g!, h!, x0, Optim.Options(g_tol = 1e-12))
+```
+
 ## Box Constrained Optimization

 A primal interior-point algorithm for simple "box" constraints (lower and upper bounds) is available. Reusing our Rosenbrock example from above, boxed minimization is performed as follows:
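For reviewers who want to try the new option, here is a minimal usage sketch of the per-component tolerance this patch introduces. The objective mirrors `f₂` from the commit message; the names `f2`, `g2!`, `opts`, `res` and the particular tolerance values are illustrative choices rather than part of the patch.

```jl
using Optim

# f₂ from the commit message: the gradient with respect to y is 10^8 times
# more sensitive to displacement than the gradient with respect to x
f2(v)     = v[1]^2 + 1e8 * v[2]^2 + 1
g2!(G, v) = (G[1] = 2 * v[1]; G[2] = 2e8 * v[2]; G)

# Loosen the gradient tolerance only for the badly scaled second component;
# g_reltol is set to 0 to mirror the new test in test/general/api.jl.
opts = Optim.Options(g_abstol = [1e-8, 1e-2], g_reltol = 0)
res  = optimize(f2, g2!, [1.0, 1.0], LBFGS(), opts)

Optim.g_converged(res)  # true once abs(g[i]) ≤ g_abstol[i] holds for every component
```

A scalar `g_abstol` keeps the existing behavior, so calls such as `Optim.Options(g_abstol = 1e-8)` are unchanged.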