#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
r"""
Cross-validation utilities using batch evaluation mode.
"""
from __future__ import annotations
from typing import Any, NamedTuple
import torch
from botorch.exceptions.errors import UnsupportedError
from botorch.fit import fit_gpytorch_mll
from botorch.models.gpytorch import GPyTorchModel
from botorch.models.likelihoods.sparse_outlier_noise import (
SparseOutlierGaussianLikelihood,
)
from botorch.models.multitask import MultiTaskGP
from botorch.posteriors.fully_bayesian import GaussianMixturePosterior
from botorch.posteriors.gpytorch import GPyTorchPosterior
from botorch.posteriors.posterior import Posterior
from gpytorch.distributions import MultitaskMultivariateNormal, MultivariateNormal
from gpytorch.likelihoods import FixedNoiseGaussianLikelihood
from gpytorch.mlls.marginal_log_likelihood import MarginalLogLikelihood
from linear_operator.operators import DiagLinearOperator
from linear_operator.utils.cholesky import psd_safe_cholesky
from torch import Tensor
[docs]
class CVFolds(NamedTuple):
train_X: Tensor
test_X: Tensor
train_Y: Tensor
test_Y: Tensor
train_Yvar: Tensor | None = None
test_Yvar: Tensor | None = None
[docs]
class CVResults(NamedTuple):
"""Results from cross-validation.
This named tuple contains the cross-validation predictions and observed values.
For both ``batch_cross_validation`` and ``efficient_loo_cv``, the ``posterior``
field contains the predictive distribution with mean and variance accessible
via ``posterior.mean`` and ``posterior.variance``.
For ``batch_cross_validation``, the posterior has shape ``n x 1 x m`` where n
is the number of folds, 1 is the single held-out point per fold, and m is the
number of outputs.
For ``efficient_loo_cv``, the posterior has the same shape structure to maintain
consistency, though the underlying distribution is constructed from the
efficient LOO formulas rather than from separate model fits.
NOTE: When ``untransform=True`` is used with a nonlinear outcome transform
(e.g., ``Log``), the posterior will be a ``TransformedPosterior`` rather than
a ``GPyTorchPosterior``. For ensemble models, it will be a
``GaussianMixturePosterior``.
"""
model: GPyTorchModel
posterior: Posterior
observed_Y: Tensor
observed_Yvar: Tensor | None = None
[docs]
def gen_loo_cv_folds(
train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor | None = None
) -> CVFolds:
r"""Generate LOO CV folds w.r.t. to ``n``.
Args:
train_X: A ``n x d`` or ``batch_shape x n x d`` (batch mode) tensor of training
features.
train_Y: A ``n x (m)`` or ``batch_shape x n x (m)`` (batch mode) tensor of
training observations.
train_Yvar: An ``n x (m)`` or ``batch_shape x n x (m)`` (batch mode) tensor
of observed measurement noise.
Returns:
CVFolds NamedTuple with the following fields:
- train_X: A ``n x (n-1) x d`` or ``batch_shape x n x (n-1) x d`` tensor of
training features.
- test_X: A ``n x 1 x d`` or ``batch_shape x n x 1 x d`` tensor of test
features.
- train_Y: A ``n x (n-1) x m`` or ``batch_shape x n x (n-1) x m`` tensor of
training observations.
- test_Y: A ``n x 1 x m`` or ``batch_shape x n x 1 x m`` tensor of test
observations.
- train_Yvar: A ``n x (n-1) x m`` or ``batch_shape x n x (n-1) x m`` tensor
of observed measurement noise.
- test_Yvar: A ``n x 1 x m`` or ``batch_shape x n x 1 x m`` tensor of observed
measurement noise.
Example:
>>> train_X = torch.rand(10, 1)
>>> train_Y = torch.rand_like(train_X)
>>> cv_folds = gen_loo_cv_folds(train_X, train_Y)
>>> cv_folds.train_X.shape
torch.Size([10, 9, 1])
"""
masks = torch.eye(train_X.shape[-2], dtype=torch.bool, device=train_X.device)
if train_Y.dim() < train_X.dim():
# add output dimension
train_Y = train_Y.unsqueeze(-1)
if train_Yvar is not None:
train_Yvar = train_Yvar.unsqueeze(-1)
train_X_cv = torch.cat(
[train_X[..., ~m, :].unsqueeze(dim=-3) for m in masks], dim=-3
)
test_X_cv = torch.cat([train_X[..., m, :].unsqueeze(dim=-3) for m in masks], dim=-3)
train_Y_cv = torch.cat(
[train_Y[..., ~m, :].unsqueeze(dim=-3) for m in masks], dim=-3
)
test_Y_cv = torch.cat([train_Y[..., m, :].unsqueeze(dim=-3) for m in masks], dim=-3)
if train_Yvar is None:
train_Yvar_cv = None
test_Yvar_cv = None
else:
train_Yvar_cv = torch.cat(
[train_Yvar[..., ~m, :].unsqueeze(dim=-3) for m in masks], dim=-3
)
test_Yvar_cv = torch.cat(
[train_Yvar[..., m, :].unsqueeze(dim=-3) for m in masks], dim=-3
)
return CVFolds(
train_X=train_X_cv,
test_X=test_X_cv,
train_Y=train_Y_cv,
test_Y=test_Y_cv,
train_Yvar=train_Yvar_cv,
test_Yvar=test_Yvar_cv,
)
[docs]
def batch_cross_validation(
model_cls: type[GPyTorchModel],
mll_cls: type[MarginalLogLikelihood],
cv_folds: CVFolds,
fit_args: dict[str, Any] | None = None,
observation_noise: bool = False,
model_init_kwargs: dict[str, Any] | None = None,
) -> CVResults:
r"""Perform cross validation by using GPyTorch batch mode.
WARNING: This function is currently very memory inefficient; use it only
for problems of small size.
Args:
model_cls: A GPyTorchModel class. This class must initialize the likelihood
internally. Note: Multi-task GPs are not currently supported.
mll_cls: A MarginalLogLikelihood class.
cv_folds: A CVFolds tuple. For LOO-CV with n training points, the leading
dimension of size n represents the n folds (batch dimension), e.g.,
``cv_folds.train_X`` has shape ``n x (n-1) x d`` and ``cv_folds.test_X``
has shape ``n x 1 x d``. This batch structure enables fitting n
independent GPs simultaneously.
fit_args: Arguments passed along to fit_gpytorch_mll.
model_init_kwargs: Keyword arguments passed to the model constructor.
Returns:
A CVResults tuple with the following fields
- model: GPyTorchModel for batched cross validation
- posterior: GPyTorchPosterior where the mean has shape ``n x 1 x m`` or
``batch_shape x n x 1 x m``
- observed_Y: A ``n x 1 x m`` or ``batch_shape x n x 1 x m`` tensor of
observations.
- observed_Yvar: A ``n x 1 x m`` or ``batch_shape x n x 1 x m`` tensor
of observed measurement noise.
Example:
>>> import torch
>>> from botorch.cross_validation import (
... batch_cross_validation, gen_loo_cv_folds
... )
>>>
>>> from botorch.models import SingleTaskGP
>>> from botorch.models.transforms.input import Normalize
>>> from botorch.models.transforms.outcome import Standardize
>>> from gpytorch.mlls import ExactMarginalLogLikelihood
>>> train_X = torch.rand(10, 1)
>>> train_Y = torch.rand_like(train_X)
>>> cv_folds = gen_loo_cv_folds(train_X, train_Y)
>>> input_transform = Normalize(d=train_X.shape[-1])
>>>
>>> cv_results = batch_cross_validation(
... model_cls=SingleTaskGP,
... mll_cls=ExactMarginalLogLikelihood,
... cv_folds=cv_folds,
... model_init_kwargs={
... "input_transform": input_transform,
... },
... )
"""
if issubclass(model_cls, MultiTaskGP):
raise UnsupportedError(
"Multi-task GPs are not currently supported by `batch_cross_validation`."
)
model_init_kws = model_init_kwargs if model_init_kwargs is not None else {}
if cv_folds.train_Yvar is not None:
model_init_kws["train_Yvar"] = cv_folds.train_Yvar
model_cv = model_cls(
train_X=cv_folds.train_X,
train_Y=cv_folds.train_Y,
**model_init_kws,
)
mll_cv = mll_cls(model_cv.likelihood, model_cv)
mll_cv.to(cv_folds.train_X)
fit_args = fit_args or {}
mll_cv = fit_gpytorch_mll(mll_cv, **fit_args)
# Evaluate on the hold-out set in batch mode
with torch.no_grad():
posterior = model_cv.posterior(
cv_folds.test_X, observation_noise=observation_noise
)
return CVResults(
model=model_cv,
posterior=posterior,
observed_Y=cv_folds.test_Y,
observed_Yvar=cv_folds.test_Yvar,
)
[docs]
def loo_cv(
model: GPyTorchModel,
observation_noise: bool = True,
untransform: bool = True,
) -> CVResults:
r"""Compute efficient Leave-One-Out cross-validation for a GP model.
This is a high-level convenience function that automatically dispatches to
the appropriate LOO CV implementation based on the model type:
- For ensemble models (``_is_ensemble=True``): Uses ``ensemble_loo_cv`` which
returns a ``GaussianMixturePosterior`` with both per-member and mixture
statistics.
- For standard GP models: Uses ``efficient_loo_cv`` which returns a
``GPyTorchPosterior`` with the LOO predictive distributions.
Both implementations use efficient O(n³) matrix algebra rather than the
naive O(n⁴) approach of refitting models for each fold.
NOTE: This function does not refit the model to each LOO fold. The model
hyperparameters are kept fixed, providing a fast approximation to full
LOO CV. For models where hyperparameter changes are significant, consider
using ``batch_cross_validation`` instead.
NOTE: The ``untransform`` parameter defaults to True, which means results
are returned in the original outcome space. Callers that previously relied
on results in the model's internal (transformed) space should pass
``untransform=False`` explicitly.
Args:
model: A fitted GPyTorchModel. The model type determines which LOO CV
implementation is used.
observation_noise: If True (default), return the posterior
predictive variance (including observation noise). If False,
return the posterior variance of the latent function (excluding
observation noise). The posterior variance is computed by
subtracting the observation noise from the posterior predictive
variance.
untransform: If True (default), untransform the LOO predictions and
observed values back to the original outcome space when the model
has an outcome transform (e.g., ``Standardize``). This makes the
results consistent with ``model.posterior()`` and
``batch_cross_validation``. If False, return results in the
model's internal (transformed) space.
Returns:
CVResults: A named tuple containing:
- model: The fitted GP model.
- posterior: The LOO predictive distributions. For ensemble models,
this is a ``GaussianMixturePosterior``; otherwise, it's a
``GPyTorchPosterior``.
- observed_Y: The observed Y values.
- observed_Yvar: The observed noise variances (if applicable).
Example:
>>> import torch
>>> from botorch.cross_validation import loo_cv
>>> from botorch.models import SingleTaskGP
>>> from botorch.fit import fit_gpytorch_mll
>>> from gpytorch.mlls import ExactMarginalLogLikelihood
>>>
>>> train_X = torch.rand(20, 2, dtype=torch.float64)
>>> train_Y = torch.sin(train_X).sum(dim=-1, keepdim=True)
>>> model = SingleTaskGP(train_X, train_Y)
>>> mll = ExactMarginalLogLikelihood(model.likelihood, model)
>>> fit_gpytorch_mll(mll)
>>> loo_results = loo_cv(model)
>>> loo_results.posterior.mean.shape
torch.Size([20, 1, 1])
See Also:
- ``efficient_loo_cv``: Direct access to the standard GP implementation.
- ``ensemble_loo_cv``: Direct access to the ensemble model implementation.
- ``batch_cross_validation``: Full LOO CV with model refitting.
"""
loo_fun = (
ensemble_loo_cv if getattr(model, "_is_ensemble", False) else efficient_loo_cv
)
return loo_fun(model, observation_noise=observation_noise, untransform=untransform)
[docs]
def efficient_loo_cv(
model: GPyTorchModel,
observation_noise: bool = True,
untransform: bool = True,
) -> CVResults:
r"""Compute efficient Leave-One-Out cross-validation for a GP model.
NOTE: This function does not refit the model to each LOO fold, in contrast to
batch_cross_validation. This is a memory- and compute-efficient way to compute LOO,
but it does not account for potential changes in the model parameters due to the
removal of a single observation. This is typically ok in cases with a lot of data,
but can result in substantial differences (typically over-estimating performance)
in the low data regime.
This function leverages a well-known linear algebraic identity to compute
all LOO predictive distributions in O(n^3) time, compared to the naive
approach which requires O(n^4) time (O(n^3) per fold for n folds).
The efficient LOO formulas for GPs are:
.. math::
\mu_{LOO,i} = y_i - \frac{[K^{-1}(y - \mu)]_i}{[K^{-1}]_{ii}}
\sigma^2_{LOO,i} = \frac{1}{[K^{-1}]_{ii}}
where K is the covariance matrix including observation noise. This gives
the posterior predictive variance (including noise). To get the posterior
variance (excluding noise), we subtract the observation noise:
.. math::
\sigma^2_{posterior,i} = \sigma^2_{LOO,i} - \sigma^2_{noise}
NOTE: This function assumes the model has already been fitted and that the
model's ``forward`` method returns a ``MultivariateNormal`` distribution.
Args:
model: A fitted GPyTorchModel whose ``forward`` method returns a
``MultivariateNormal`` distribution.
observation_noise: If True (default), return the posterior
predictive variance (including observation noise). If False,
return the posterior variance of the latent function (excluding
observation noise).
untransform: If True (default), untransform the LOO predictions and
observed values back to the original outcome space when the model
has an outcome transform (e.g., ``Standardize``). This makes the
results consistent with ``model.posterior()`` and
``batch_cross_validation``. If False, return results in the
model's internal (transformed) space.
Returns:
CVResults: A named tuple containing:
- model: The fitted GP model.
- posterior: The LOO predictive distributions (typically a
``GPyTorchPosterior``; with nonlinear outcome transforms like
``Log``, a ``TransformedPosterior``). The posterior mean and
variance have shape ``n x 1 x m`` or
``batch_shape x n x 1 x m``, matching the structure of
``batch_cross_validation`` (n folds, 1 held-out point per fold,
m outputs). The underlying distribution has diagonal covariance
since LOO predictions at different held-out points are computed
independently.
- observed_Y: The observed Y values with shape ``n x 1 x m`` or
``batch_shape x n x 1 x m``.
- observed_Yvar: The observed noise variances (if provided) with shape
``n x 1 x m`` or ``batch_shape x n x 1 x m``.
Example:
>>> import torch
>>> from botorch.cross_validation import efficient_loo_cv
>>> from botorch.models import SingleTaskGP
>>> from botorch.fit import fit_gpytorch_mll
>>> from gpytorch.mlls import ExactMarginalLogLikelihood
>>>
>>> train_X = torch.rand(20, 2, dtype=torch.float64)
>>> train_Y = torch.sin(train_X).sum(dim=-1, keepdim=True)
>>> model = SingleTaskGP(train_X, train_Y)
>>> mll = ExactMarginalLogLikelihood(model.likelihood, model)
>>> fit_gpytorch_mll(mll)
>>> loo_results = efficient_loo_cv(model)
>>> loo_results.posterior.mean.shape
torch.Size([20, 1, 1])
"""
# Compute raw LOO predictions
loo_mean, loo_variance, train_Y = _compute_loo_predictions(
model, observation_noise=observation_noise
)
# Get the number of outputs
num_outputs = model.num_outputs
# Build the posterior from raw LOO predictions
posterior = _build_loo_posterior(
loo_mean=loo_mean, loo_variance=loo_variance, num_outputs=num_outputs
)
# Reshape observed data to LOO CV output format: n x 1 x m
observed_Y = _reshape_to_loo_cv_format(train_Y, num_outputs)
# Get observed Yvar if available (for fixed noise models)
observed_Yvar = None
if isinstance(model.likelihood, FixedNoiseGaussianLikelihood):
observed_Yvar = _reshape_to_loo_cv_format(model.likelihood.noise, num_outputs)
# Untransform predictions and observed values back to original space
if untransform and hasattr(model, "outcome_transform"):
posterior, observed_Y, observed_Yvar = _untransform_loo_results(
model=model,
posterior=posterior,
observed_Y=observed_Y,
observed_Yvar=observed_Yvar,
)
return CVResults(
model=model,
posterior=posterior,
observed_Y=observed_Y,
observed_Yvar=observed_Yvar,
)
def _untransform_loo_results(
model: GPyTorchModel,
posterior: Posterior,
observed_Y: Tensor,
observed_Yvar: Tensor | None,
) -> tuple[Posterior, Tensor, Tensor | None]:
r"""Untransform LOO CV results from model-internal space to original space.
Applies the model's outcome transform to map the LOO posterior and observed
values back to the original (untransformed) outcome space. This uses
``outcome_transform.untransform_posterior`` for the posterior and
``outcome_transform.untransform`` for the observed values.
For linear transforms like ``Standardize``, the posterior is analytically
rescaled and remains a ``GPyTorchPosterior``. For nonlinear transforms like
``Log``, the posterior is wrapped in a ``TransformedPosterior``.
Args:
model: The GP model with an ``outcome_transform`` attribute.
posterior: The LOO posterior in the model's internal (transformed) space.
Shape: ``n x 1 x m`` or ``batch_shape x n x 1 x m``.
observed_Y: The observed Y values in transformed space with shape
``n x 1 x m`` or ``batch_shape x n x 1 x m``.
observed_Yvar: The observed noise variances in transformed space (if
applicable) with shape ``n x 1 x m`` or ``batch_shape x n x 1 x m``.
Returns:
A tuple of (posterior, observed_Y, observed_Yvar) in the original
(untransformed) outcome space. Note: if the outcome transform has a
``batch_shape`` (e.g., ``Standardize(batch_shape=[num_models])``),
broadcasting during untransform may add leading dimensions to
``observed_Y`` that the caller must align with the posterior layout.
"""
outcome_transform = model.outcome_transform
# Validate observed_Y shape before any transforms.
if observed_Y.shape[-2] != 1:
raise ValueError(
"Expected observed_Y to have size 1 at dim -2 (q dimension), "
f"got shape {observed_Y.shape}."
)
posterior = outcome_transform.untransform_posterior(posterior)
# Untransform observed_Y (and observed_Yvar if present).
# observed_Y has shape n x 1 x m; Standardize.untransform expects
# batch_shape x n x m. We squeeze the q=1 dim, untransform, and restore it.
observed_Y_squeezed = observed_Y.squeeze(-2)
observed_Yvar_squeezed = (
observed_Yvar.squeeze(-2) if observed_Yvar is not None else None
)
observed_Y_utf, observed_Yvar_utf = outcome_transform.untransform(
observed_Y_squeezed, observed_Yvar_squeezed
)
observed_Y = observed_Y_utf.unsqueeze(-2)
observed_Yvar = (
observed_Yvar_utf.unsqueeze(-2) if observed_Yvar_utf is not None else None
)
return posterior, observed_Y, observed_Yvar
def _likelihood_requires_X(likelihood: object) -> bool:
r"""Check if a likelihood requires training inputs for noise computation.
Returns True for likelihoods like SparseOutlierGaussianLikelihood that
need training inputs to determine per-point noise (e.g., outlier variances).
Standard GaussianLikelihood does not need training inputs.
"""
return isinstance(likelihood, SparseOutlierGaussianLikelihood)
def _subtract_observation_noise(model: GPyTorchModel, loo_variance: Tensor) -> Tensor:
r"""Subtract observation noise from LOO variance to get posterior variance.
The efficient LOO formula computes the posterior predictive variance, which
includes observation noise. To get the posterior variance of the latent
function (without noise), we subtract the observation noise variance.
This implementation uses the likelihood's ``forward`` method to extract noise
variances in a general way. The ``forward`` method takes function samples and
returns a distribution where the variance represents the observation noise.
.. math::
\sigma^2_{posterior,i} = \sigma^2_{LOO,i} - \sigma^2_{noise}
Args:
model: The GP model with a likelihood containing the noise variance.
loo_variance: The LOO posterior predictive variance with shape
``... x n x 1``.
Returns:
The posterior variance (without noise) with the same shape.
"""
likelihood = model.likelihood
# Use the likelihood's forward method to extract noise variances.
# By passing zeros as function samples, the returned distribution's
# variance gives us the observation noise at each point.
noise_shape = loo_variance.shape[:-1] # ... x n
zeros = torch.zeros(
noise_shape, dtype=loo_variance.dtype, device=loo_variance.device
)
# Only pass training inputs when the likelihood needs them (e.g.,
# SparseOutlierGaussianLikelihood for per-point outlier noise).
# Standard GaussianLikelihood doesn't need inputs, and passing
# pre-transform inputs can cause shape mismatches with models that
# use dimension-changing input transforms (e.g., AppendFeatures).
if _likelihood_requires_X(likelihood):
train_inputs = getattr(model, "train_inputs", None)
noise_dist = likelihood.forward(
zeros, train_inputs[0] if train_inputs else None
)
else:
noise_dist = likelihood.forward(zeros)
# Extract noise variance and reshape to match loo_variance
noise = noise_dist.variance.unsqueeze(-1) # ... x n x 1
loo_variance = loo_variance - noise
# Clamp to ensure non-negative variance
return loo_variance.clamp(min=0.0)
def _compute_loo_predictions(
model: GPyTorchModel,
observation_noise: bool = True,
) -> tuple[Tensor, Tensor, Tensor]:
r"""Compute raw LOO predictions (means and variances) for a GP model.
This is an internal helper that computes the leave-one-out predictive means
and variances using efficient matrix algebra. The formulas are:
.. math::
\mu_{LOO,i} = y_i - \frac{[K^{-1}(y - \mu)]_i}{[K^{-1}]_{ii}}
\sigma^2_{LOO,i} = \frac{1}{[K^{-1}]_{ii}}
where K is the covariance matrix including observation noise and μ is the
prior mean. This gives the posterior predictive variance (including noise).
To get the posterior variance (excluding noise), we subtract the observation
noise variance.
Args:
model: A fitted GPyTorchModel in eval mode whose ``forward`` method returns
a ``MultivariateNormal`` distribution.
observation_noise: If True (default), return the posterior
predictive variance (including observation noise). If False,
return the posterior variance of the latent function (excluding
observation noise).
Returns:
A tuple of (loo_mean, loo_variance, train_Y) where:
- loo_mean: LOO predictive means with shape ``... x n x 1``
- loo_variance: LOO predictive variances with shape ``... x n x 1``
- train_Y: The training targets from the model
Raises:
UnsupportedError: If the model doesn't have required attributes or
the forward method doesn't return a MultivariateNormal.
"""
# Get training data - model should have train_inputs attribute
if not hasattr(model, "train_inputs") or model.train_inputs is None:
raise UnsupportedError(
"Model must have train_inputs attribute for efficient LOO CV."
)
if not hasattr(model, "train_targets") or model.train_targets is None:
raise UnsupportedError(
"Model must have train_targets attribute for efficient LOO CV."
)
train_X = model.train_inputs[0] # Shape: n x d or batch_shape x n x d
# Check for models with auxiliary inputs (e.g., auxiliary experiment data)
# In such models, train_inputs[0] is a tuple of tensors rather than a single tensor
if isinstance(train_X, tuple):
raise UnsupportedError(
"Efficient LOO CV is not supported for models with auxiliary inputs. "
"train_inputs[0] is a tuple of tensors, indicating auxiliary data."
)
train_Y = model.train_targets # Shape: n or batch_shape x n (batched outputs)
n = train_X.shape[-2]
prior_dist = model.forward(train_X)
# Check that we got a MultivariateNormal
if not isinstance(prior_dist, MultivariateNormal):
raise UnsupportedError(
f"Model's forward method must return a MultivariateNormal, "
f"got {type(prior_dist).__name__}."
)
# Extract mean from the prior
# Shape: n for single-output, or m x n for batched multi-output
mean = prior_dist.mean
# Add observation noise to the diagonal via the likelihood
# The likelihood adds noise: K_noisy = K + sigma^2 * I
# Only pass training inputs when the likelihood needs them (e.g.,
# SparseOutlierGaussianLikelihood for per-point outlier noise).
# Standard GaussianLikelihood doesn't need inputs, and passing
# pre-transform inputs can cause shape mismatches with models that
# use dimension-changing input transforms (e.g., AppendFeatures).
if _likelihood_requires_X(model.likelihood):
noisy_mvn = model.likelihood(prior_dist, model.train_inputs[0])
else:
noisy_mvn = model.likelihood(prior_dist)
# Get the covariance matrix - use lazy representation for potential caching
K_noisy = noisy_mvn.lazy_covariance_matrix.to_dense()
# Compute Cholesky decomposition (adds jitter if needed)
L = psd_safe_cholesky(K_noisy)
# Compute K^{-1}(y - mean) via Cholesky solve
# Shape: ... x n x 1 where ... is batch_shape (includes m for multi-output)
residuals = (train_Y - mean).unsqueeze(-1)
K_inv_residuals = torch.cholesky_solve(residuals, L)
# Compute diagonal of K^{-1}
# K_inv = L^{-T} @ L^{-1}, so K_inv_diag[i] = sum_j (L^{-1}[j,i])^2
identity = torch.eye(n, dtype=L.dtype, device=L.device)
if L.dim() > 2:
identity = identity.expand(*L.shape[:-2], n, n)
L_inv = torch.linalg.solve_triangular(L, identity, upper=False)
K_inv_diag = (L_inv**2).sum(dim=-2) # ... x n
# Compute LOO predictions using the efficient formulas:
# sigma2_loo_i = 1 / [K^{-1}]_{ii}
# mu_loo_i = y_i - [K^{-1}(y - mean)]_i / [K^{-1}]_{ii}
# K_inv_diag has shape ... x n, so after unsqueeze(-1) we get ... x n x 1
# (the last dim is 1 because each GP is single-output).
loo_variance = (1.0 / K_inv_diag).unsqueeze(-1) # ... x n x 1
loo_mean = train_Y.unsqueeze(-1) - K_inv_residuals * loo_variance
# If we want the posterior (noiseless) variance, subtract the noise
if not observation_noise:
loo_variance = _subtract_observation_noise(model, loo_variance)
return loo_mean, loo_variance, train_Y
def _build_loo_posterior(
loo_mean: Tensor,
loo_variance: Tensor,
num_outputs: int,
) -> GPyTorchPosterior:
r"""Build a GPyTorchPosterior from raw LOO predictions.
Args:
loo_mean: LOO means with shape ``... x m x n x 1`` (multi-output) or
``... x n x 1`` (single-output), where ``...`` is optional batch_shape.
loo_variance: LOO variances with same shape as loo_mean.
num_outputs: Number of outputs (m). 1 for single-output models.
Returns:
A GPyTorchPosterior with shape ``... x n x 1 x m``.
"""
# Reshape tensors to final shape: ... x n x 1 x m
if num_outputs > 1:
# Multi-output: ... x m x n x 1 -> ... x n x 1 x m
# The m dimension is at position -3, move it to position -1
loo_mean = loo_mean.movedim(-3, -1)
loo_variance = loo_variance.movedim(-3, -1)
else:
# Single-output: ... x n x 1 -> ... x n x 1 x 1
loo_mean = loo_mean.unsqueeze(-1)
loo_variance = loo_variance.unsqueeze(-1)
# Create distribution: for multi-output use MTMVN, for single-output use MVN.
# Both require mean shape ... x n x q (where q=1) and diagonal covariance.
# We squeeze the m dimension to get ... x n x 1 for the MVN mean, then
# iterate over outputs to create independent MVNs.
mvns = [
MultivariateNormal(
mean=loo_mean[..., t],
covariance_matrix=DiagLinearOperator(loo_variance[..., t]),
)
for t in range(num_outputs)
]
if num_outputs > 1:
mvn = MultitaskMultivariateNormal.from_independent_mvns(mvns=mvns)
else:
mvn = mvns[0]
return GPyTorchPosterior(distribution=mvn)
def _reshape_to_loo_cv_format(tensor: Tensor, num_outputs: int) -> Tensor:
r"""Reshape a tensor to the standard LOO CV output format: ``n x 1 x m``.
This helper converts tensors with internal model format (which varies by
number of outputs) to the consistent output format used by LOO CV results.
Args:
tensor: Input tensor with shape:
- Single-output: ``n`` (1D)
- Multi-output: ``m x n`` (2D)
num_outputs: Number of outputs (m). 1 for single-output models.
Returns:
Reshaped tensor with shape ``n x 1 x m``.
"""
if num_outputs > 1:
# Multi-output: m x n -> n x m -> n x 1 x m
return tensor.movedim(-2, -1).unsqueeze(-2)
else:
# Single-output: n -> n x 1 -> n x 1 x 1
return tensor.unsqueeze(-1).unsqueeze(-1)
[docs]
def ensemble_loo_cv(
model: GPyTorchModel,
observation_noise: bool = True,
untransform: bool = True,
) -> CVResults:
r"""Compute efficient LOO cross-validation for ensemble models.
This function computes Leave-One-Out cross-validation for ensemble models
like ``SaasFullyBayesianSingleTaskGP``. For these models, the ``forward`` method
returns a ``MultivariateNormal`` with a batch dimension containing statistics
for all models in the ensemble.
The LOO predictions from each ensemble member form a Gaussian mixture.
This function returns a ``CVResults`` with a ``GaussianMixturePosterior`` that
provides both per-member statistics (via ``posterior.mean`` and
``posterior.variance``) and aggregated mixture statistics (via
``posterior.mixture_mean`` and ``posterior.mixture_variance``).
The mixture statistics are computed using the law of total variance:
.. math::
\mu_{mix} = \frac{1}{K} \sum_{k=1}^{K} \mu_k
\sigma^2_{mix} = \frac{1}{K} \sum_{k=1}^{K} \sigma^2_k +
\frac{1}{K} \sum_{k=1}^{K} \mu_k^2 - \mu_{mix}^2
where K is the number of ensemble members.
NOTE: This function assumes the model has already been fitted (e.g., using
``fit_fully_bayesian_model_nuts``) and that the model is an ensemble model
with ``_is_ensemble = True``.
Args:
model: An ensemble GPyTorchModel (e.g., SaasFullyBayesianSingleTaskGP)
whose ``forward`` method returns a ``MultivariateNormal`` distribution
with a batch dimension for ensemble members.
observation_noise: If True (default), return the posterior
predictive variance (including observation noise). If False,
return the posterior variance of the latent function (excluding
observation noise).
untransform: If True (default), untransform the LOO predictions and
observed values back to the original outcome space when the model
has an outcome transform (e.g., ``Standardize``). This makes the
results consistent with ``model.posterior()`` and
``batch_cross_validation``. If False, return results in the
model's internal (transformed) space.
Returns:
CVResults: A named tuple containing:
- model: The fitted ensemble GP model.
- posterior: A ``GaussianMixturePosterior`` with per-member shape
``n x num_models x 1 x m``. Access per-member statistics via
``posterior.mean`` and ``posterior.variance``, and mixture
statistics via ``posterior.mixture_mean`` and
``posterior.mixture_variance``.
- observed_Y: The observed Y values with shape
``n x num_models x 1 x m``, matching the posterior layout so
that element-wise operations (e.g., ``posterior.mean -
observed_Y``) work correctly.
- observed_Yvar: The observed noise variances (if provided) with
the same shape as ``observed_Y``.
Example:
>>> import torch
>>> from botorch.cross_validation import ensemble_loo_cv
>>> from botorch.models.fully_bayesian import SaasFullyBayesianSingleTaskGP
>>> from botorch.models.fully_bayesian import fit_fully_bayesian_model_nuts
>>>
>>> train_X = torch.rand(20, 2, dtype=torch.float64)
>>> train_Y = torch.sin(train_X).sum(dim=-1, keepdim=True)
>>> model = SaasFullyBayesianSingleTaskGP(train_X, train_Y)
>>> fit_fully_bayesian_model_nuts(model, warmup_steps=64, num_samples=32)
>>> loo_results = ensemble_loo_cv(model)
>>> loo_results.posterior.mean.shape # Per-member means
torch.Size([20, 32, 1, 1])
>>> loo_results.posterior.mixture_mean.shape # Aggregated mixture mean
torch.Size([20, 1, 1])
"""
# Check that this is an ensemble model
if not getattr(model, "_is_ensemble", False):
raise UnsupportedError(
"ensemble_loo_cv requires an ensemble model (with _is_ensemble=True). "
f"Got model of type {type(model).__name__}. "
"For non-ensemble models, use efficient_loo_cv instead."
)
# Compute raw LOO predictions
# For ensemble models, shapes are: num_models x n x 1
loo_mean, loo_variance, train_Y = _compute_loo_predictions(
model, observation_noise=observation_noise
)
# Validate that we have the expected batch dimension for ensemble
if loo_mean.dim() < 3:
raise UnsupportedError(
"Expected ensemble model to produce batched LOO results with shape "
f"(batch_shape x num_models x n x 1), but got shape {loo_mean.shape}."
)
# Get the number of outputs
num_outputs = model.num_outputs
# Build the GaussianMixturePosterior
posterior = _build_ensemble_loo_posterior(
loo_mean=loo_mean, loo_variance=loo_variance, num_outputs=num_outputs
)
# Extract observed data (first ensemble member) and reshape to LOO CV format
observed_Y, observed_Yvar = _get_ensemble_observed_data(
model=model, train_Y=train_Y, num_outputs=num_outputs
)
# Untransform predictions and observed values back to original space
if untransform and hasattr(model, "outcome_transform"):
observed_Y_ndim = observed_Y.dim()
posterior, observed_Y, observed_Yvar = _untransform_loo_results(
model=model,
posterior=posterior,
observed_Y=observed_Y,
observed_Yvar=observed_Yvar,
)
# After untransform with a batch-shaped outcome transform (e.g.,
# Standardize(batch_shape=[num_models])), observed_Y gains leading
# batch dimensions from the transform. For example, with
# batch_shape=[num_models], observed_Y goes from [n, 1, m] to
# [num_models, n, 1, m]. The posterior has num_models at MCMC_DIM=-3,
# giving shape [n, num_models, 1, m]. Align observed_Y by moving all
# added leading dims to just before the q=1 dim (MCMC_DIM position).
n_added = observed_Y.dim() - observed_Y_ndim
if n_added > 1:
raise UnsupportedError(
"ensemble_loo_cv does not support outcome transforms that add "
f"more than 1 batch dimension. Got {n_added} added dimensions "
f"(shape went from {observed_Y_ndim}D to {observed_Y.dim()}D)."
)
if n_added == 1:
observed_Y = observed_Y.movedim(0, -3)
if observed_Yvar is not None:
observed_Yvar = observed_Yvar.movedim(0, -3)
# Ensure observed_Y has the num_models dimension at MCMC_DIM=-3 to match
# the posterior shape [n, num_models, 1, m]. Without this, element-wise
# operations like `posterior.mean - observed_Y` would fail due to
# incompatible broadcasting (3D [n, 1, m] vs 4D [n, num_models, 1, m]).
posterior_mean = posterior.mean
if observed_Y.dim() < posterior_mean.dim():
observed_Y = observed_Y.unsqueeze(-3).expand_as(posterior_mean)
if observed_Yvar is not None:
observed_Yvar = observed_Yvar.unsqueeze(-3).expand_as(posterior_mean)
return CVResults(
model=model,
posterior=posterior,
observed_Y=observed_Y,
observed_Yvar=observed_Yvar,
)
def _build_ensemble_loo_posterior(
loo_mean: Tensor,
loo_variance: Tensor,
num_outputs: int,
) -> GaussianMixturePosterior:
r"""Build a GaussianMixturePosterior from raw ensemble LOO predictions.
This function takes raw LOO means and variances from an ensemble model
(computed by ``_compute_loo_predictions``) and packages them into a
GaussianMixturePosterior that provides both per-member and mixture statistics.
Args:
loo_mean: LOO means with shape ``batch_shape x num_models x n x 1``
(single-output) or ``batch_shape x num_models x m x n x 1``
(multi-output).
loo_variance: LOO variances with same shape as loo_mean.
num_outputs: Number of outputs (m). 1 for single-output models.
Returns:
GaussianMixturePosterior with shape ``batch_shape x n x num_models x 1 x m``.
The num_models dimension is at MCMC_DIM=-3.
"""
# Normalize shapes: add m=1 dimension for single-output to match multi-output
if num_outputs == 1:
# Single-output: ... x num_models x n x 1 -> ... x num_models x 1 x n x 1
loo_mean = loo_mean.unsqueeze(-3)
loo_variance = loo_variance.unsqueeze(-3)
# Now both cases have shape: ... x num_models x m x n x 1
# Transform to target shape: ... x n x num_models x 1 x m
# 1. squeeze(-1): ... x num_models x m x n
# 2. movedim(-1, -3): ... x n x num_models x m (move n before num_models)
# 3. unsqueeze(-2): ... x n x num_models x 1 x m
loo_mean = loo_mean.squeeze(-1).movedim(-1, -3).unsqueeze(-2)
loo_variance = loo_variance.squeeze(-1).movedim(-1, -3).unsqueeze(-2)
# Create distribution: iterate over outputs to create independent MVNs
# After indexing with [..., t], shape is: batch_shape x n x num_models x 1
mvns = [
MultivariateNormal(
mean=loo_mean[..., t],
covariance_matrix=DiagLinearOperator(loo_variance[..., t]),
)
for t in range(num_outputs)
]
if num_outputs > 1:
mvn = MultitaskMultivariateNormal.from_independent_mvns(mvns=mvns)
else:
mvn = mvns[0]
return GaussianMixturePosterior(distribution=mvn)
def _get_ensemble_observed_data(
model: GPyTorchModel,
train_Y: Tensor,
num_outputs: int,
) -> tuple[Tensor, Tensor | None]:
r"""Extract observed data from an ensemble model for LOO CV.
Extracts the first ensemble member's training targets and observation noise,
verifies all members share the same data, and reshapes to LOO CV format.
Args:
model: The ensemble GP model.
train_Y: Training targets with shape ``... x num_models x n`` (single-output)
or ``... x num_models x m x n`` (multi-output).
num_outputs: Number of outputs (m).
Returns:
(observed_Y, observed_Yvar) with shape ``... x n x 1 x m``.
Raises:
UnsupportedError: If ensemble members have different training data.
"""
# num_models is at dim -2 for single-output, -3 for multi-output
num_models_dim = -2 if num_outputs == 1 else -3
# Verify all ensemble members share the same training data
_verify_ensemble_data_consistency(train_Y, num_models_dim, "train_Y")
# Extract first ensemble member's data (they're all the same)
train_Y_first = train_Y.select(num_models_dim, 0)
observed_Y = _reshape_to_loo_cv_format(train_Y_first, num_outputs)
# Get observed Yvar if available (for fixed noise models)
observed_Yvar = None
if isinstance(model.likelihood, FixedNoiseGaussianLikelihood):
noise = model.likelihood.noise
# Noise has the same shape structure as train_Y
# Verify consistency and extract first member
if noise.dim() > 1:
_verify_ensemble_data_consistency(
noise, num_models_dim, "observation noise"
)
noise = noise.select(num_models_dim, 0)
observed_Yvar = _reshape_to_loo_cv_format(noise, num_outputs)
return observed_Y, observed_Yvar
def _verify_ensemble_data_consistency(
tensor: Tensor,
num_models_dim: int,
tensor_name: str,
) -> None:
r"""Verify all ensemble members have identical data along ``num_models_dim``.
Args:
tensor: Data tensor with a num_models dimension.
num_models_dim: Dimension index for num_models (typically -2 or -3).
tensor_name: Name for error messages (e.g., "train_Y").
Raises:
UnsupportedError: If data differs across ensemble members.
"""
num_models = tensor.shape[num_models_dim]
if num_models <= 1:
return
first_member = tensor.select(num_models_dim, 0)
first_expanded = first_member.unsqueeze(num_models_dim).expand_as(tensor)
if not torch.equal(tensor, first_expanded):
raise UnsupportedError(
f"Ensemble members have different {tensor_name}. "
"ensemble_loo_cv only supports ensembles where all members share the "
"same training data (e.g., fully Bayesian models with MCMC samples). "
"For ensembles with different data per member, cross-validate each "
"member individually using efficient_loo_cv."
)