Source code for botorch.optim.batched_lbfgs_b

# noqa
# flake8: noqa
# @lint-ignore-every LICENSELINT
"""
This is a port of the L-BFGS-B implementation from SciPy s.t. it supports batched
evaluations. That is, the objective function's output value (and its gradient)
can be evaluated at a batch of points at once.
This yields optimization speedups for acquisition function optimization,
where multiple independent problems with the same structure are optimized in parallel.

This file is written such that it explicitly supports all scipy versions
from 1.13 to 1.15 (likely 1.16, too, based on its pre-release version).
This file might break for higher versions, as it uses internal APIs.
There is a major revision of the core optimization code in 1.15, as it is
ported from FORTRAN to C, we handle the API changes, though, and are
compatible with both.
"""

# License for the Python wrapper
# ==============================

# Heavily modified to allow batched optimization by Samuel Müller (2025) <sammuller@meta.com>

# Copyright (c) 2004 David M. Cooke <cookedm@physics.mcmaster.ca>

# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

# Modifications by Travis Oliphant and Enthought, Inc. for inclusion in SciPy

import typing as tp

import numpy as np
from numpy import array, asarray, zeros
from scipy.optimize import _lbfgsb
from scipy.optimize._constraints import old_bound_to_new
from scipy.optimize._lbfgsb_py import LbfgsInvHessProduct
from scipy.optimize._optimize import (
    _call_callback_maybe_halt,
    _check_unknown_options,
    _wrap_callback,
    OptimizeResult,
)

from .utils import check_scipy_version_at_least


__all__ = ["fmin_l_bfgs_b_batched"]


status_messages = {
    0: "START",
    1: "NEW_X",
    2: "RESTART",
    3: "FG",
    4: "CONVERGENCE",
    5: "STOP",
    6: "WARNING",
    7: "ERROR",
    8: "ABNORMAL",
}


task_messages = {
    0: "",
    301: "",
    302: "",
    401: "NORM OF PROJECTED GRADIENT <= PGTOL",
    402: "RELATIVE REDUCTION OF F <= FACTR*EPSMCH",
    501: "CPU EXCEEDING THE TIME LIMIT",
    502: "TOTAL NO. OF F,G EVALUATIONS EXCEEDS LIMIT",
    503: "PROJECTED GRADIENT IS SUFFICIENTLY SMALL",
    504: "TOTAL NO. OF ITERATIONS REACHED LIMIT",
    505: "CALLBACK REQUESTED HALT",
    601: "ROUNDING ERRORS PREVENT PROGRESS",
    602: "STP = STPMAX",
    603: "STP = STPMIN",
    604: "XTOL TEST SATISFIED",
    701: "NO FEASIBLE SOLUTION",
    702: "FACTR < 0",
    703: "FTOL < 0",
    704: "GTOL < 0",
    705: "XTOL < 0",
    706: "STP < STPMIN",
    707: "STP > STPMAX",
    708: "STPMIN < 0",
    709: "STPMAX < STPMIN",
    710: "INITIAL G >= 0",
    711: "M <= 0",
    712: "N <= 0",
    713: "INVALID NBD",
}

uses_c_implementation = check_scipy_version_at_least(minor=15)


class OptimState:
    def __init__(
        self,
        bounds: list[tuple[float]] | None,
        maxls: int,
        x0: np.ndarray,
        n: int,
        m: int,
    ):
        standard_int = np.int32 if uses_c_implementation else _lbfgsb.types.intvar.dtype
        self.nbd = zeros(n, standard_int)
        self.low_bnd = zeros(n, np.float64)
        self.upper_bnd = zeros(n, np.float64)
        self.bounds_map = {
            (-np.inf, np.inf): 0,
            (1, np.inf): 1,
            (1, 1): 2,
            (-np.inf, 1): 3,
        }

        self.x = array(x0, np.float64)
        self.f = array(0.0, np.int32 if uses_c_implementation else np.float64)
        self.g = zeros((n,), np.int32 if uses_c_implementation else np.float64)
        self.wa = zeros(2 * m * n + 5 * n + 11 * m * m + 8 * m, np.float64)
        self.iwa = zeros(3 * n, standard_int)
        self.task = (
            zeros(2, dtype=np.int32) if uses_c_implementation else zeros(1, "S60")
        )
        self.csave = zeros(1, "S60")  # only used for fortran implementation
        self.ln_task = zeros(2, dtype=np.int32)  # only used for c implementation
        self.lsave = zeros(4, standard_int)
        self.isave = zeros(44, standard_int)
        self.dsave = zeros(29, np.float64)

        self.state_str = None

        if not uses_c_implementation:  # pragma: no cover
            self.task[:] = "START"

        self.n_iterations = 0
        self.fun_calls = 0

        if bounds is not None:
            for i in range(0, n):
                l, u = bounds[:, i]
                if not np.isinf(l):
                    self.low_bnd[i] = l
                    l = 1
                if not np.isinf(u):
                    self.upper_bnd[i] = u
                    u = 1
                self.nbd[i] = self.bounds_map[l, u]

        if not maxls > 0:
            raise ValueError("maxls must be positive.")


[docs] def fmin_l_bfgs_b_batched( func, x0, bounds=None, maxcor=10, factr=1e7, ftol=None, pgtol=1e-5, tol=None, maxiter=15000, disp=None, callback=None, maxls=20, pass_batch_indices=False, ): """ Minimize multiple inputs to a batched function ``func`` using the L-BFGS-B algorithm. We minimize multiple inputs to the function at once (by providing a 2d array of shape [b, n]). We assume that the function ``func`` is batched, i.e. it will return a 1d array of shape [b,] of independent function values, when passed a 2d array of shape [b, n]. Parameters ---------- func : callable f(x,*args) Function to minimize. x0 : ndarray Initial guess of shape [b, n]. bounds : list, optional ``(min, max)`` pairs for each element in ``x``, defining the bounds on that parameter. Use None or +-inf for one of ``min`` or ``max`` when there is no bound in that direction. maxcor : int, optional The maximum number of variable metric corrections used to define the limited memory matrix. (The limited memory BFGS method does not store the full hessian but uses this many terms in an approximation to it.) factr : float, optional The iteration stops when ``(f^k - f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= factr * eps``, where ``eps`` is the machine precision, which is automatically generated by the code. Typical values for ``factr`` are: 1e12 for low accuracy; 1e7 for moderate accuracy; 10.0 for extremely high accuracy. See Notes for relationship to ``ftol``, which is exposed (instead of ``factr``) by the ``scipy.optimize.minimize`` interface to L-BFGS-B. ftol: float, optional Set ftol directly, meaning the iteration stops when ``(f^k - f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= ftol`` pgtol : float, optional The iteration will stop when ``max{|proj g_i | i = 1, ..., n} <= pgtol`` where ``proj g_i`` is the i-th component of the projected gradient. tol : float, optional An alias for ``pgtol`` to be compatible with the ``scipy.optimize.minimize``. maxiter : int, optional Maximum number of iterations. disp: int, optional This is depcreated and only here for backwards compatibility. callback : callable, optional Called after each iteration for each batch item, as ``callback(xk)``, where ``xk`` is the current parameter vector. maxls : int, optional Maximum number of line search steps (per iteration). Default is 20. pass_batch_indices : bool If True, fun is called with an additional kwargs ``batch_indices``, which is a list that is as long as the current batch is wide, and indexes into the original batch specified via ``x0``. Returns ------- x : array_like Estimated position of the minimum. f : float Value of ``func`` at the minimum. d : dict Information dictionary. * d['warnflag'] is - 0 if converged, - 1 if too many function evaluations or too many iterations, - 2 if stopped for another reason, given in d['task'] * d['grad'] is the gradient at the minimum (should be 0 ish) * d['funcalls'] is the number of function calls made. * d['nit'] is the number of iterations. See also -------- minimize: Interface to minimization algorithms for multivariate functions. See the 'L-BFGS-B' ``method`` in particular. Note that the ``ftol`` option is made available via that interface, while ``factr`` is provided via this interface, where ``factr`` is the factor multiplying the default machine floating-point precision to arrive at ``ftol``: ``ftol = factr * numpy.finfo(float).eps``. Notes ----- License of L-BFGS-B (FORTRAN code): The version included here (in fortran code) is 3.0 (released April 25, 2011). It was written by Ciyou Zhu, Richard Byrd, and Jorge Nocedal <nocedal@ece.nwu.edu>. It carries the following condition for use: This software is freely available, but we expect that all publications describing work using this software, or all commercial products using it, quote at least one of the references given below. This software is released under the BSD License. SciPy uses a C-translated and modified version of the Fortran code, L-BFGS-B v3.0 (released April 25, 2011, BSD-3 licensed). Original Fortran version was written by Ciyou Zhu, Richard Byrd, Jorge Nocedal and, Jose Luis Morales. References ---------- * R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound Constrained Optimization, (1995), SIAM Journal on Scientific and Statistical Computing, 16, 5, pp. 1190-1208. * C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778: L-BFGS-B, FORTRAN routines for large scale bound constrained optimization (1997), ACM Transactions on Mathematical Software, 23, 4, pp. 550 - 560. * J.L. Morales and J. Nocedal. L-BFGS-B: Remark on Algorithm 778: L-BFGS-B, FORTRAN routines for large scale bound constrained optimization (2011), ACM Transactions on Mathematical Software, 38, 1. Examples -------- Solve a linear regression problem via ``fmin_l_bfgs_b``. To do this, first we define an objective function ``f(m, b) = (y - y_model)**2``, where ``y`` describes the observations and ``y_model`` the prediction of the linear model as ``y_model = m*x + b``. The bounds for the parameters, ``m`` and ``b``, are arbitrarily chosen as ``(0,5)`` and ``(5,10)`` for this example. >>> import numpy as np >>> from scipy.optimize import fmin_l_bfgs_b >>> X = np.arange(0, 10, 1) >>> M = 2 >>> B = 3 >>> Y = M * X + B >>> def func(parameters, *args): ... x = args[0] ... y = args[1] ... m, b = parameters ... y_model = m*x + b ... error = sum(np.power((y - y_model), 2)) ... return error >>> initial_values = np.array([0.0, 1.0]) >>> x_opt, f_opt, info = fmin_l_bfgs_b(func, x0=initial_values, args=(X, Y), ... approx_grad=True) >>> x_opt, f_opt array([1.99999999, 3.00000006]), 1.7746231151323805e-14 # may vary The optimized parameters in ``x_opt`` agree with the ground truth parameters ``m`` and ``b``. Next, let us perform a bound contrained optimization using the ``bounds`` parameter. >>> bounds = [(0, 5), (5, 10)] >>> x_opt, f_op, info = fmin_l_bfgs_b(func, x0=initial_values, args=(X, Y), ... approx_grad=True, bounds=bounds) >>> x_opt, f_opt array([1.65990508, 5.31649385]), 15.721334516453945 # may vary """ if disp is not None: # pragma: no cover print("The option `disp` is deprecated and will be removed in a future.") if ftol is None: ftol = factr * np.finfo(float).eps else: assert factr is None, ( "ftol and factr cannot be used together, set factr explicitly to None." ) # build options callback = _wrap_callback(callback) opts = { "maxcor": maxcor, "ftol": ftol, "gtol": tol if tol is not None else pgtol, "maxiter": maxiter, "callback": callback, "maxls": maxls, "pass_batch_indices": pass_batch_indices, } results = _minimize_lbfgsb(func, x0, bounds=bounds, **opts) fs = [res["fun"] for res in results] xs = [res["x"] for res in results] return np.stack(xs), np.stack(fs), results
def _minimize_lbfgsb( fun: tp.Callable[np.ndarray, tp.Tuple[np.ndarray, np.ndarray]], x0, bounds=None, maxcor=10, ftol=2.2204460492503131e-09, gtol=1e-5, maxiter=15000, callback=None, maxls=20, pass_batch_indices=False, **unknown_options, ): """ Minimize a scalar function of one or more variables using the L-BFGS-B algorithm. Options ------- fun: callable[np.ndarray] -> tuple[np.ndarray, np.ndarray] accepts a batch of inputs [b,d] and returns a batch of outputs [b] and gradients [b,d] as a tuple bounds: list[tuple[float, float]], optional ``(min, max)`` pairs for each element in ``x``, defining the bounds on that parameter. Use None or +-inf for one of ``min`` or ``max`` when there is no bound in that direction. maxcor : int The maximum number of variable metric corrections used to define the limited memory matrix. (The limited memory BFGS method does not store the full hessian but uses this many terms in an approximation to it.) ftol : float The iteration stops when ``(f^k - f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= ftol``. The default is set to be a classical machine epsilon gtol : float The iteration will stop when ``max{|proj g_i | i = 1, ..., n} <= gtol`` where ``proj g_i`` is the i-th component of the projected gradient. maxiter : int Maximum number of iterations. callback : callable, optional A "wrapped" callback function called after each iteration for each batch item, as ``callback(xk)``, where ``xk`` is the current parameter vector. It is called on the original xk, thus it should not change xk inplace. It can halt the algorithm by raising a StopIteration exception. maxls : int, optional Maximum number of line search steps (per iteration). Default is 20. pass_batch_indices : bool If True, fun is called with an additional kwargs ``batch_indices``, which is a list that is as long as the current batch is wide, and indexes into the original batch specified via ``x0``. Notes ----- The option ``ftol`` is exposed via the ``scipy.optimize.minimize`` interface, but calling ``scipy.optimize.fmin_l_bfgs_b`` directly exposes ``factr``. The relationship between the two is ``ftol = factr * numpy.finfo(float).eps``. I.e., ``factr`` multiplies the default machine floating-point precision to arrive at ``ftol``. """ _check_unknown_options(unknown_options) m = maxcor pgtol = gtol factr = ftol / np.finfo(float).eps x0s = asarray(x0) assert x0s.ndim == 2, "x0 must be a 2-D array" (b, n) = x0s.shape # historically old-style bounds were/are expected by lbfgsb. # That's still the case but we'll deal with new-style from here on, # it's easier if bounds is None: pass elif len(bounds) != n: raise ValueError(f"length of x0 != length of bounds, {n} != len({bounds})") else: bounds = np.array(old_bound_to_new(bounds)) # check bounds if (bounds[0] > bounds[1]).any(): raise ValueError( "LBFGSB - one of the lower bounds is greater than an upper bound." ) # initial vector must lie within the bounds. Otherwise ScalarFunction and # approx_derivative will cause problems x0s = np.clip(x0s, bounds[0], bounds[1]) # _prepare_scalar_function can use bounds=None to represent no bounds func_and_grad = fun states = [OptimState(bounds, maxls, x0, n, m) for x0 in x0s] dones = np.zeros(b, bool) do_forward = np.zeros(b, bool) while 1: # prep for i in range(b): while ( ~dones[i] & ~do_forward[i] ): # setulb sometimes needs to be called multiple times # until it needs new info or is done state = states[i] # g may become float32 if a user provides a function that calculates # the Jacobian in float32 (see gh-18730). The underlying Fortran/C code # expects float64, so upcast it state.g = state.g.astype(np.float64) # x, f, g, wa, iwa, task, csave, lsave, isave, dsave = \ if uses_c_implementation: _lbfgsb.setulb( m, state.x, state.low_bnd, state.upper_bnd, state.nbd, state.f, state.g, factr, pgtol, state.wa, state.iwa, state.task, state.lsave, state.isave, state.dsave, maxls, state.ln_task, ) else: # pragma: no cover _lbfgsb.setulb( m, state.x, state.low_bnd, state.upper_bnd, state.nbd, state.f, state.g, factr, pgtol, state.wa, state.iwa, state.task, -1, # iprint, default is -1 (not supported by the C impl) state.csave, state.lsave, state.isave, state.dsave, maxls, ) if not uses_c_implementation: # pragma: no cover task_str = state.task.tobytes() if ( state.task[0] == 3 if uses_c_implementation else task_str.startswith(b"FG") ): # The minimization routine wants f and g at the current x. # Note that interruptions due to maxfun are postponed # until the completion of the current minimization iteration. # Overwrite f and g: # state.f, state.g = func_and_grad( # state.x # ) # todo potentially use [:] assignment? do_forward[i] = True elif ( state.task[0] == 1 if uses_c_implementation else task_str.startswith(b"NEW_X") ): # new iteration state.n_iterations += 1 intermediate_result = OptimizeResult(x=state.x, fun=state.f) if _call_callback_maybe_halt(callback, intermediate_result): if uses_c_implementation: state.task[0] = 5 state.task[1] = 505 else: # pragma: no cover state.task[:] = "STOP: CALLBACK REQUESTED HALT" if state.n_iterations >= maxiter: if uses_c_implementation: state.task[0] = 5 state.task[1] = 504 else: # pragma: no cover state.task[:] = ( "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT" ) else: dones[i] = True if np.any(do_forward): # only the do_forward stuff is worked on total_x = np.stack( [state.x for state, do_fw in zip(states, do_forward) if do_fw] ) if pass_batch_indices: batch_indices = [i for i, do_fw in enumerate(do_forward) if do_fw] total_f, total_g = func_and_grad(total_x, batch_indices=batch_indices) else: total_f, total_g = func_and_grad(total_x) for func_i, i in enumerate( do_forward.nonzero()[0] ): # taking the 0 as we are interested in the first (and only) dim states[i].f = total_f[func_i] states[i].g = total_g[func_i] states[i].fun_calls += 1 do_forward[:] = False if np.all(dones): break results = [] for state in states: if not uses_c_implementation: # pragma: no cover task_str = state.task.tobytes().strip(b"\x00").strip() if ( state.task[0] == 4 if uses_c_implementation else task_str.startswith(b"CONV") ): warnflag = 0 elif state.n_iterations >= maxiter: warnflag = 1 else: warnflag = 2 # These two portions of the workspace are described in the mainlb # subroutine in lbfgsb.f (See line 363), if you are on an older # scipy version (< 1.14) and in the function docstring in "__lbfgsb.c", # ws and wy arguments, otherwise. s = state.wa[0 : m * n].reshape(m, n) y = state.wa[m * n : 2 * m * n].reshape(m, n) # See lbfgsb.f line 160 for this portion of the workspace. # isave(31) = the total number of BFGS updates prior the current iteration; n_bfgs_updates = state.isave[30] n_corrs = min(n_bfgs_updates, maxcor) hess_inv = LbfgsInvHessProduct(s[:n_corrs], y[:n_corrs]) if uses_c_implementation: msg = status_messages[state.task[0]] + ": " + task_messages[state.task[1]] else: # pragma: no cover msg = task_str.decode() results.append( OptimizeResult( fun=state.f, jac=state.g, nfev=state.fun_calls, njev=None, nit=state.n_iterations, status=warnflag, message=msg, x=state.x, success=(warnflag == 0), hess_inv=hess_inv, ) ) return results # extra helper function def translate_bounds_for_lbfgsb( lower_bounds: tp.Sequence[float | None] | float | None, upper_bounds: tp.Sequence[float | None] | float | None, num_features: int, q: int, ): """ Translates the bounds to the format expected by L-BFGS-B. Parameters ---------- lower_bounds : tensor(n,) or float or None Lower bounds for the parameters. If None, then the lower bounds are unbounded. If float is provided, then that value is used as the lower bound for all parameters. upper_bounds : tensor(n,) or None Upper bounds for the parameters. If None, then the upper bounds are unbounded. If float is provided, then that value is used as the lower bound for all parameters. num_features : int Number of features in the model. q : int Number of repetitions to be optimized jointly in each item. fixed_features : dict, optional Dictionary mapping indices to the values that they are fixed to. These indices will not be included in the returned bounds. """ bounds = [lower_bounds, upper_bounds] for i in range(2): if bounds[i] is None: bounds[i] = num_features * [None] elif not isinstance(bounds[i], tp.Iterable): bounds[i] = num_features * [bounds[i]] else: bounds[i] = list(bounds[i]) if len(bounds[i]) == num_features: bounds[i] = sum([bounds[i] for _ in range(q)], []) assert len(bounds[i]) == num_features * q, ( f"Instead got {len(bounds[i])} != {num_features} * {q}." ) return list(zip(*bounds))