Source code for botorch.optim.batched_lbfgs_b

# noqa
# flake8: noqa
# @lint-ignore-every LICENSELINT
"""
This is a port of the L-BFGS-B implementation from SciPy s.t. it supports batched
evaluations. That is, the objective function's output value (and its gradient)
can be evaluated at a batch of points at once.
This yields optimization speedups for acquisition function optimization,
where multiple independent problems with the same structure are optimized in parallel.

This file is written such that it explicitly supports all scipy versions
from 1.13 to 1.15 (likely 1.16, too, based on its pre-release version).
This file might break for higher versions, as it uses internal APIs.
There is a major revision of the core optimization code in 1.15, as it is
ported from FORTRAN to C, we handle the API changes, though, and are
compatible with both.
"""

# License for the Python wrapper
# ==============================

# Heavily modified to allow batched optimization by Samuel Müller (2025) <sammuller@meta.com>

# Copyright (c) 2004 David M. Cooke <cookedm@physics.mcmaster.ca>

# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

# Modifications by Travis Oliphant and Enthought, Inc. for inclusion in SciPy

import typing as tp

import numpy as np
from numpy import array, asarray, zeros
from scipy.optimize import _lbfgsb
from scipy.optimize._constraints import old_bound_to_new
from scipy.optimize._lbfgsb_py import LbfgsInvHessProduct
from scipy.optimize._optimize import (
    _call_callback_maybe_halt,
    _check_unknown_options,
    _wrap_callback,
    OptimizeResult,
)

from .utils import check_scipy_version_at_least


__all__ = ["fmin_l_bfgs_b_batched"]


status_messages = {
    0: "START",
    1: "NEW_X",
    2: "RESTART",
    3: "FG",
    4: "CONVERGENCE",
    5: "STOP",
    6: "WARNING",
    7: "ERROR",
    8: "ABNORMAL",
}


task_messages = {
    0: "",
    301: "",
    302: "",
    401: "NORM OF PROJECTED GRADIENT <= PGTOL",
    402: "RELATIVE REDUCTION OF F <= FACTR*EPSMCH",
    501: "CPU EXCEEDING THE TIME LIMIT",
    502: "TOTAL NO. OF F,G EVALUATIONS EXCEEDS LIMIT",
    503: "PROJECTED GRADIENT IS SUFFICIENTLY SMALL",
    504: "TOTAL NO. OF ITERATIONS REACHED LIMIT",
    505: "CALLBACK REQUESTED HALT",
    601: "ROUNDING ERRORS PREVENT PROGRESS",
    602: "STP = STPMAX",
    603: "STP = STPMIN",
    604: "XTOL TEST SATISFIED",
    701: "NO FEASIBLE SOLUTION",
    702: "FACTR < 0",
    703: "FTOL < 0",
    704: "GTOL < 0",
    705: "XTOL < 0",
    706: "STP < STPMIN",
    707: "STP > STPMAX",
    708: "STPMIN < 0",
    709: "STPMAX < STPMIN",
    710: "INITIAL G >= 0",
    711: "M <= 0",
    712: "N <= 0",
    713: "INVALID NBD",
}

uses_c_implementation = check_scipy_version_at_least(minor=15)


class OptimState:
    def __init__(
        self,
        bounds: list[tuple[float]] | None,
        maxls: int,
        x0: np.ndarray,
        n: int,
        m: int,
    ):
        standard_int = np.int32 if uses_c_implementation else _lbfgsb.types.intvar.dtype
        self.nbd = zeros(n, standard_int)
        self.low_bnd = zeros(n, np.float64)
        self.upper_bnd = zeros(n, np.float64)
        self.bounds_map = {
            (-np.inf, np.inf): 0,
            (1, np.inf): 1,
            (1, 1): 2,
            (-np.inf, 1): 3,
        }

        self.x = array(x0, np.float64)
        self.f = array(0.0, np.int32 if uses_c_implementation else np.float64)
        self.g = zeros((n,), np.int32 if uses_c_implementation else np.float64)
        self.wa = zeros(2 * m * n + 5 * n + 11 * m * m + 8 * m, np.float64)
        self.iwa = zeros(3 * n, standard_int)
        self.task = (
            zeros(2, dtype=np.int32) if uses_c_implementation else zeros(1, "S60")
        )
        self.csave = zeros(1, "S60")  # only used for fortran implementation
        self.ln_task = zeros(2, dtype=np.int32)  # only used for c implementation
        self.lsave = zeros(4, standard_int)
        self.isave = zeros(44, standard_int)
        self.dsave = zeros(29, np.float64)

        self.state_str = None

        if not uses_c_implementation:  # pragma: no cover
            self.task[:] = "START"

        self.n_iterations = 0
        self.fun_calls = 0

        if bounds is not None:
            for i in range(0, n):
                l, u = bounds[:, i]
                if not np.isinf(l):
                    self.low_bnd[i] = l
                    l = 1
                if not np.isinf(u):
                    self.upper_bnd[i] = u
                    u = 1
                self.nbd[i] = self.bounds_map[l, u]

        if not maxls > 0:
            raise ValueError("maxls must be positive.")



[docs]
def fmin_l_bfgs_b_batched(
    func,
    x0,
    bounds=None,
    maxcor=10,
    factr=1e7,
    ftol=None,
    pgtol=1e-5,
    tol=None,
    maxiter=15000,
    disp=None,
    callback=None,
    maxls=20,
    pass_batch_indices=False,
):
    """
    Minimize multiple inputs to a batched function ``func`` using the L-BFGS-B algorithm.
    We minimize multiple inputs to the function at once (by providing a 2d array
    of shape [b, n]).
    We assume that the function ``func`` is batched, i.e. it will return a 1d array
    of shape [b,] of independent function values, when passed a 2d array of
    shape [b, n].

    Parameters
    ----------
    func : callable f(x,*args)
        Function to minimize.
    x0 : ndarray
        Initial guess of shape [b, n].
    bounds : list, optional
        ``(min, max)`` pairs for each element in ``x``, defining
        the bounds on that parameter. Use None or +-inf for one of ``min`` or
        ``max`` when there is no bound in that direction.
    maxcor : int, optional
        The maximum number of variable metric corrections
        used to define the limited memory matrix. (The limited memory BFGS
        method does not store the full hessian but uses this many terms in an
        approximation to it.)
    factr : float, optional
        The iteration stops when
        ``(f^k - f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= factr * eps``,
        where ``eps`` is the machine precision, which is automatically
        generated by the code. Typical values for ``factr`` are: 1e12 for
        low accuracy; 1e7 for moderate accuracy; 10.0 for extremely
        high accuracy. See Notes for relationship to ``ftol``, which is exposed
        (instead of ``factr``) by the ``scipy.optimize.minimize`` interface to
        L-BFGS-B.
    ftol: float, optional
        Set ftol directly, meaning the iteration stops when
        ``(f^k - f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= ftol``
    pgtol : float, optional
        The iteration will stop when
        ``max{|proj g_i | i = 1, ..., n} <= pgtol``
        where ``proj g_i`` is the i-th component of the projected gradient.
    tol : float, optional
        An alias for ``pgtol`` to be compatible with the ``scipy.optimize.minimize``.
    maxiter : int, optional
        Maximum number of iterations.
    disp: int, optional
        This is depcreated and only here for backwards compatibility.
    callback : callable, optional
        Called after each iteration for each batch item, as ``callback(xk)``,
        where ``xk`` is the current parameter vector.
    maxls : int, optional
        Maximum number of line search steps (per iteration). Default is 20.
    pass_batch_indices : bool
        If True, fun is called with an additional kwargs ``batch_indices``,
        which is a list that is as long as the current batch is wide,
        and indexes into the original batch specified via ``x0``.

    Returns
    -------
    x : array_like
        Estimated position of the minimum.
    f : float
        Value of ``func`` at the minimum.
    d : dict
        Information dictionary.

        * d['warnflag'] is

          - 0 if converged,
          - 1 if too many function evaluations or too many iterations,
          - 2 if stopped for another reason, given in d['task']

        * d['grad'] is the gradient at the minimum (should be 0 ish)
        * d['funcalls'] is the number of function calls made.
        * d['nit'] is the number of iterations.

    See also
    --------
    minimize: Interface to minimization algorithms for multivariate
        functions. See the 'L-BFGS-B' ``method`` in particular. Note that the
        ``ftol`` option is made available via that interface, while ``factr`` is
        provided via this interface, where ``factr`` is the factor multiplying
        the default machine floating-point precision to arrive at ``ftol``:
        ``ftol = factr * numpy.finfo(float).eps``.

    Notes
    -----
    License of L-BFGS-B (FORTRAN code):

    The version included here (in fortran code) is 3.0
    (released April 25, 2011). It was written by Ciyou Zhu, Richard Byrd,
    and Jorge Nocedal <nocedal@ece.nwu.edu>. It carries the following
    condition for use:

    This software is freely available, but we expect that all publications
    describing work using this software, or all commercial products using it,
    quote at least one of the references given below. This software is released
    under the BSD License.

    SciPy uses a C-translated and modified version of the Fortran code,
    L-BFGS-B v3.0 (released April 25, 2011, BSD-3 licensed). Original Fortran
    version was written by Ciyou Zhu, Richard Byrd, Jorge Nocedal and,
    Jose Luis Morales.

    References
    ----------
    * R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound
      Constrained Optimization, (1995), SIAM Journal on Scientific and
      Statistical Computing, 16, 5, pp. 1190-1208.
    * C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778: L-BFGS-B,
      FORTRAN routines for large scale bound constrained optimization (1997),
      ACM Transactions on Mathematical Software, 23, 4, pp. 550 - 560.
    * J.L. Morales and J. Nocedal. L-BFGS-B: Remark on Algorithm 778: L-BFGS-B,
      FORTRAN routines for large scale bound constrained optimization (2011),
      ACM Transactions on Mathematical Software, 38, 1.

    Examples
    --------
    Solve a linear regression problem via ``fmin_l_bfgs_b``. To do this, first we define
    an objective function ``f(m, b) = (y - y_model)**2``, where ``y`` describes the
    observations and ``y_model`` the prediction of the linear model as
    ``y_model = m*x + b``. The bounds for the parameters, ``m`` and ``b``, are
    arbitrarily chosen as ``(0,5)`` and ``(5,10)`` for this example.

    >>> import numpy as np
    >>> from scipy.optimize import fmin_l_bfgs_b
    >>> X = np.arange(0, 10, 1)
    >>> M = 2
    >>> B = 3
    >>> Y = M * X + B
    >>> def func(parameters, *args):
    ...     x = args[0]
    ...     y = args[1]
    ...     m, b = parameters
    ...     y_model = m*x + b
    ...     error = sum(np.power((y - y_model), 2))
    ...     return error

    >>> initial_values = np.array([0.0, 1.0])

    >>> x_opt, f_opt, info = fmin_l_bfgs_b(func, x0=initial_values, args=(X, Y),
    ...                                    approx_grad=True)
    >>> x_opt, f_opt
    array([1.99999999, 3.00000006]), 1.7746231151323805e-14  # may vary

    The optimized parameters in ``x_opt`` agree with the ground truth parameters
    ``m`` and ``b``. Next, let us perform a bound contrained optimization using
    the ``bounds`` parameter.

    >>> bounds = [(0, 5), (5, 10)]
    >>> x_opt, f_op, info = fmin_l_bfgs_b(func, x0=initial_values, args=(X, Y),
    ...                                   approx_grad=True, bounds=bounds)
    >>> x_opt, f_opt
    array([1.65990508, 5.31649385]), 15.721334516453945  # may vary
    """
    if disp is not None:  # pragma: no cover
        print("The option `disp` is deprecated and will be removed in a future.")
    if ftol is None:
        ftol = factr * np.finfo(float).eps
    else:
        assert factr is None, (
            "ftol and factr cannot be used together, set factr explicitly to None."
        )

    # build options
    callback = _wrap_callback(callback)
    opts = {
        "maxcor": maxcor,
        "ftol": ftol,
        "gtol": tol if tol is not None else pgtol,
        "maxiter": maxiter,
        "callback": callback,
        "maxls": maxls,
        "pass_batch_indices": pass_batch_indices,
    }

    results = _minimize_lbfgsb(func, x0, bounds=bounds, **opts)
    fs = [res["fun"] for res in results]
    xs = [res["x"] for res in results]

    return np.stack(xs), np.stack(fs), results



def _minimize_lbfgsb(
    fun: tp.Callable[np.ndarray, tp.Tuple[np.ndarray, np.ndarray]],
    x0,
    bounds=None,
    maxcor=10,
    ftol=2.2204460492503131e-09,
    gtol=1e-5,
    maxiter=15000,
    callback=None,
    maxls=20,
    pass_batch_indices=False,
    **unknown_options,
):
    """
    Minimize a scalar function of one or more variables using the L-BFGS-B
    algorithm.

    Options
    -------
    fun: callable[np.ndarray] -> tuple[np.ndarray, np.ndarray]
        accepts a batch of inputs [b,d] and returns a batch of outputs [b]
        and gradients [b,d] as a tuple
    bounds: list[tuple[float, float]], optional
        ``(min, max)`` pairs for each element in ``x``, defining
        the bounds on that parameter. Use None or +-inf for one of ``min`` or
        ``max`` when there is no bound in that direction.
    maxcor : int
        The maximum number of variable metric corrections used to
        define the limited memory matrix. (The limited memory BFGS
        method does not store the full hessian but uses this many terms
        in an approximation to it.)
    ftol : float
        The iteration stops when ``(f^k -
        f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= ftol``.
        The default is set to be a classical machine epsilon
    gtol : float
        The iteration will stop when ``max{|proj g_i | i = 1, ..., n}
        <= gtol`` where ``proj g_i`` is the i-th component of the
        projected gradient.
    maxiter : int
        Maximum number of iterations.
    callback : callable, optional
        A "wrapped" callback function called after each iteration
        for each batch item, as ``callback(xk)``,
        where ``xk`` is the current parameter vector. It is called on the
        original xk, thus it should not change xk inplace.
        It can halt the algorithm by raising a StopIteration exception.
    maxls : int, optional
        Maximum number of line search steps (per iteration). Default is 20.
    pass_batch_indices : bool
        If True, fun is called with an additional kwargs ``batch_indices``,
        which is a list that is as long as the current batch is wide,
        and indexes into the original batch specified via ``x0``.

    Notes
    -----
    The option ``ftol`` is exposed via the ``scipy.optimize.minimize`` interface,
    but calling ``scipy.optimize.fmin_l_bfgs_b`` directly exposes ``factr``. The
    relationship between the two is ``ftol = factr * numpy.finfo(float).eps``.
    I.e., ``factr`` multiplies the default machine floating-point precision to
    arrive at ``ftol``.

    """

    _check_unknown_options(unknown_options)
    m = maxcor
    pgtol = gtol
    factr = ftol / np.finfo(float).eps

    x0s = asarray(x0)
    assert x0s.ndim == 2, "x0 must be a 2-D array"
    (b, n) = x0s.shape

    # historically old-style bounds were/are expected by lbfgsb.
    # That's still the case but we'll deal with new-style from here on,
    # it's easier
    if bounds is None:
        pass
    elif len(bounds) != n:
        raise ValueError(f"length of x0 != length of bounds, {n} != len({bounds})")
    else:
        bounds = np.array(old_bound_to_new(bounds))

        # check bounds
        if (bounds[0] > bounds[1]).any():
            raise ValueError(
                "LBFGSB - one of the lower bounds is greater than an upper bound."
            )

        # initial vector must lie within the bounds. Otherwise ScalarFunction and
        # approx_derivative will cause problems
        x0s = np.clip(x0s, bounds[0], bounds[1])

    # _prepare_scalar_function can use bounds=None to represent no bounds

    func_and_grad = fun

    states = [OptimState(bounds, maxls, x0, n, m) for x0 in x0s]
    dones = np.zeros(b, bool)
    do_forward = np.zeros(b, bool)

    while 1:
        # prep
        for i in range(b):
            while (
                ~dones[i] & ~do_forward[i]
            ):  # setulb sometimes needs to be called multiple times
                # until it needs new info or is done
                state = states[i]
                # g may become float32 if a user provides a function that calculates
                # the Jacobian in float32 (see gh-18730). The underlying Fortran/C code
                # expects float64, so upcast it
                state.g = state.g.astype(np.float64)
                # x, f, g, wa, iwa, task, csave, lsave, isave, dsave = \
                if uses_c_implementation:
                    _lbfgsb.setulb(
                        m,
                        state.x,
                        state.low_bnd,
                        state.upper_bnd,
                        state.nbd,
                        state.f,
                        state.g,
                        factr,
                        pgtol,
                        state.wa,
                        state.iwa,
                        state.task,
                        state.lsave,
                        state.isave,
                        state.dsave,
                        maxls,
                        state.ln_task,
                    )
                else:  # pragma: no cover
                    _lbfgsb.setulb(
                        m,
                        state.x,
                        state.low_bnd,
                        state.upper_bnd,
                        state.nbd,
                        state.f,
                        state.g,
                        factr,
                        pgtol,
                        state.wa,
                        state.iwa,
                        state.task,
                        -1,  # iprint, default is -1 (not supported by the C impl)
                        state.csave,
                        state.lsave,
                        state.isave,
                        state.dsave,
                        maxls,
                    )

                if not uses_c_implementation:  # pragma: no cover
                    task_str = state.task.tobytes()

                if (
                    state.task[0] == 3
                    if uses_c_implementation
                    else task_str.startswith(b"FG")
                ):
                    # The minimization routine wants f and g at the current x.
                    # Note that interruptions due to maxfun are postponed
                    # until the completion of the current minimization iteration.
                    # Overwrite f and g:
                    # state.f, state.g = func_and_grad(
                    #     state.x
                    # )  # todo potentially use [:] assignment?
                    do_forward[i] = True
                elif (
                    state.task[0] == 1
                    if uses_c_implementation
                    else task_str.startswith(b"NEW_X")
                ):
                    # new iteration
                    state.n_iterations += 1

                    intermediate_result = OptimizeResult(x=state.x, fun=state.f)
                    if _call_callback_maybe_halt(callback, intermediate_result):
                        if uses_c_implementation:
                            state.task[0] = 5
                            state.task[1] = 505
                        else:  # pragma: no cover
                            state.task[:] = "STOP: CALLBACK REQUESTED HALT"
                    if state.n_iterations >= maxiter:
                        if uses_c_implementation:
                            state.task[0] = 5
                            state.task[1] = 504
                        else:  # pragma: no cover
                            state.task[:] = (
                                "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT"
                            )
                else:
                    dones[i] = True

        if np.any(do_forward):  # only the do_forward stuff is worked on
            total_x = np.stack(
                [state.x for state, do_fw in zip(states, do_forward) if do_fw]
            )
            if pass_batch_indices:
                batch_indices = [i for i, do_fw in enumerate(do_forward) if do_fw]
                total_f, total_g = func_and_grad(total_x, batch_indices=batch_indices)
            else:
                total_f, total_g = func_and_grad(total_x)

            for func_i, i in enumerate(
                do_forward.nonzero()[0]
            ):  # taking the 0 as we are interested in the first (and only) dim
                states[i].f = total_f[func_i]
                states[i].g = total_g[func_i]
                states[i].fun_calls += 1

            do_forward[:] = False

        if np.all(dones):
            break

    results = []

    for state in states:
        if not uses_c_implementation:  # pragma: no cover
            task_str = state.task.tobytes().strip(b"\x00").strip()
        if (
            state.task[0] == 4
            if uses_c_implementation
            else task_str.startswith(b"CONV")
        ):
            warnflag = 0
        elif state.n_iterations >= maxiter:
            warnflag = 1
        else:
            warnflag = 2

        # These two portions of the workspace are described in the mainlb
        # subroutine in lbfgsb.f (See line 363), if you are on an older
        # scipy version (< 1.14) and in the function docstring in "__lbfgsb.c",
        # ws and wy arguments, otherwise.
        s = state.wa[0 : m * n].reshape(m, n)
        y = state.wa[m * n : 2 * m * n].reshape(m, n)

        # See lbfgsb.f line 160 for this portion of the workspace.
        # isave(31) = the total number of BFGS updates prior the current iteration;
        n_bfgs_updates = state.isave[30]

        n_corrs = min(n_bfgs_updates, maxcor)
        hess_inv = LbfgsInvHessProduct(s[:n_corrs], y[:n_corrs])

        if uses_c_implementation:
            msg = status_messages[state.task[0]] + ": " + task_messages[state.task[1]]
        else:  # pragma: no cover
            msg = task_str.decode()

        results.append(
            OptimizeResult(
                fun=state.f,
                jac=state.g,
                nfev=state.fun_calls,
                njev=None,
                nit=state.n_iterations,
                status=warnflag,
                message=msg,
                x=state.x,
                success=(warnflag == 0),
                hess_inv=hess_inv,
            )
        )
    return results


# extra helper function
def translate_bounds_for_lbfgsb(
    lower_bounds: tp.Sequence[float | None] | float | None,
    upper_bounds: tp.Sequence[float | None] | float | None,
    num_features: int,
    q: int,
):
    """
    Translates the bounds to the format expected by L-BFGS-B.

    Parameters
    ----------
    lower_bounds : tensor(n,) or float or None
        Lower bounds for the parameters. If None, then the lower bounds
        are unbounded. If float is provided, then that value is used as
        the lower bound for all parameters.
    upper_bounds : tensor(n,) or None
        Upper bounds for the parameters. If None, then the upper bounds
        are unbounded. If float is provided, then that value is used as
        the lower bound for all parameters.
    num_features : int
        Number of features in the model.
    q : int
        Number of repetitions to be optimized jointly in each item.
    fixed_features : dict, optional
        Dictionary mapping indices to the values that they are fixed to.
        These indices will not be included in the returned bounds.
    """
    bounds = [lower_bounds, upper_bounds]
    for i in range(2):
        if bounds[i] is None:
            bounds[i] = num_features * [None]
        elif not isinstance(bounds[i], tp.Iterable):
            bounds[i] = num_features * [bounds[i]]
        else:
            bounds[i] = list(bounds[i])
        if len(bounds[i]) == num_features:
            bounds[i] = sum([bounds[i] for _ in range(q)], [])
        assert len(bounds[i]) == num_features * q, (
            f"Instead got {len(bounds[i])} != {num_features} * {q}."
        )
    return list(zip(*bounds))