Source code for botorch.cross_validation

#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

r"""
Cross-validation utilities using batch evaluation mode.
"""

from __future__ import annotations

from typing import Any, NamedTuple

import torch
from botorch.exceptions.errors import UnsupportedError
from botorch.fit import fit_gpytorch_mll
from botorch.models.gpytorch import GPyTorchModel
from botorch.models.likelihoods.sparse_outlier_noise import (
    SparseOutlierGaussianLikelihood,
)
from botorch.models.multitask import MultiTaskGP
from botorch.posteriors.fully_bayesian import GaussianMixturePosterior
from botorch.posteriors.gpytorch import GPyTorchPosterior
from botorch.posteriors.posterior import Posterior
from gpytorch.distributions import MultitaskMultivariateNormal, MultivariateNormal
from gpytorch.likelihoods import FixedNoiseGaussianLikelihood
from gpytorch.mlls.marginal_log_likelihood import MarginalLogLikelihood
from linear_operator.operators import DiagLinearOperator
from linear_operator.utils.cholesky import psd_safe_cholesky
from torch import Tensor


[docs] class CVFolds(NamedTuple): train_X: Tensor test_X: Tensor train_Y: Tensor test_Y: Tensor train_Yvar: Tensor | None = None test_Yvar: Tensor | None = None
[docs] class CVResults(NamedTuple): """Results from cross-validation. This named tuple contains the cross-validation predictions and observed values. For both ``batch_cross_validation`` and ``efficient_loo_cv``, the ``posterior`` field contains the predictive distribution with mean and variance accessible via ``posterior.mean`` and ``posterior.variance``. For ``batch_cross_validation``, the posterior has shape ``n x 1 x m`` where n is the number of folds, 1 is the single held-out point per fold, and m is the number of outputs. For ``efficient_loo_cv``, the posterior has the same shape structure to maintain consistency, though the underlying distribution is constructed from the efficient LOO formulas rather than from separate model fits. NOTE: When ``untransform=True`` is used with a nonlinear outcome transform (e.g., ``Log``), the posterior will be a ``TransformedPosterior`` rather than a ``GPyTorchPosterior``. For ensemble models, it will be a ``GaussianMixturePosterior``. """ model: GPyTorchModel posterior: Posterior observed_Y: Tensor observed_Yvar: Tensor | None = None
[docs] def gen_loo_cv_folds( train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor | None = None ) -> CVFolds: r"""Generate LOO CV folds w.r.t. to ``n``. Args: train_X: A ``n x d`` or ``batch_shape x n x d`` (batch mode) tensor of training features. train_Y: A ``n x (m)`` or ``batch_shape x n x (m)`` (batch mode) tensor of training observations. train_Yvar: An ``n x (m)`` or ``batch_shape x n x (m)`` (batch mode) tensor of observed measurement noise. Returns: CVFolds NamedTuple with the following fields: - train_X: A ``n x (n-1) x d`` or ``batch_shape x n x (n-1) x d`` tensor of training features. - test_X: A ``n x 1 x d`` or ``batch_shape x n x 1 x d`` tensor of test features. - train_Y: A ``n x (n-1) x m`` or ``batch_shape x n x (n-1) x m`` tensor of training observations. - test_Y: A ``n x 1 x m`` or ``batch_shape x n x 1 x m`` tensor of test observations. - train_Yvar: A ``n x (n-1) x m`` or ``batch_shape x n x (n-1) x m`` tensor of observed measurement noise. - test_Yvar: A ``n x 1 x m`` or ``batch_shape x n x 1 x m`` tensor of observed measurement noise. Example: >>> train_X = torch.rand(10, 1) >>> train_Y = torch.rand_like(train_X) >>> cv_folds = gen_loo_cv_folds(train_X, train_Y) >>> cv_folds.train_X.shape torch.Size([10, 9, 1]) """ masks = torch.eye(train_X.shape[-2], dtype=torch.bool, device=train_X.device) if train_Y.dim() < train_X.dim(): # add output dimension train_Y = train_Y.unsqueeze(-1) if train_Yvar is not None: train_Yvar = train_Yvar.unsqueeze(-1) train_X_cv = torch.cat( [train_X[..., ~m, :].unsqueeze(dim=-3) for m in masks], dim=-3 ) test_X_cv = torch.cat([train_X[..., m, :].unsqueeze(dim=-3) for m in masks], dim=-3) train_Y_cv = torch.cat( [train_Y[..., ~m, :].unsqueeze(dim=-3) for m in masks], dim=-3 ) test_Y_cv = torch.cat([train_Y[..., m, :].unsqueeze(dim=-3) for m in masks], dim=-3) if train_Yvar is None: train_Yvar_cv = None test_Yvar_cv = None else: train_Yvar_cv = torch.cat( [train_Yvar[..., ~m, :].unsqueeze(dim=-3) for m in masks], dim=-3 ) test_Yvar_cv = torch.cat( [train_Yvar[..., m, :].unsqueeze(dim=-3) for m in masks], dim=-3 ) return CVFolds( train_X=train_X_cv, test_X=test_X_cv, train_Y=train_Y_cv, test_Y=test_Y_cv, train_Yvar=train_Yvar_cv, test_Yvar=test_Yvar_cv, )
[docs] def batch_cross_validation( model_cls: type[GPyTorchModel], mll_cls: type[MarginalLogLikelihood], cv_folds: CVFolds, fit_args: dict[str, Any] | None = None, observation_noise: bool = False, model_init_kwargs: dict[str, Any] | None = None, ) -> CVResults: r"""Perform cross validation by using GPyTorch batch mode. WARNING: This function is currently very memory inefficient; use it only for problems of small size. Args: model_cls: A GPyTorchModel class. This class must initialize the likelihood internally. Note: Multi-task GPs are not currently supported. mll_cls: A MarginalLogLikelihood class. cv_folds: A CVFolds tuple. For LOO-CV with n training points, the leading dimension of size n represents the n folds (batch dimension), e.g., ``cv_folds.train_X`` has shape ``n x (n-1) x d`` and ``cv_folds.test_X`` has shape ``n x 1 x d``. This batch structure enables fitting n independent GPs simultaneously. fit_args: Arguments passed along to fit_gpytorch_mll. model_init_kwargs: Keyword arguments passed to the model constructor. Returns: A CVResults tuple with the following fields - model: GPyTorchModel for batched cross validation - posterior: GPyTorchPosterior where the mean has shape ``n x 1 x m`` or ``batch_shape x n x 1 x m`` - observed_Y: A ``n x 1 x m`` or ``batch_shape x n x 1 x m`` tensor of observations. - observed_Yvar: A ``n x 1 x m`` or ``batch_shape x n x 1 x m`` tensor of observed measurement noise. Example: >>> import torch >>> from botorch.cross_validation import ( ... batch_cross_validation, gen_loo_cv_folds ... ) >>> >>> from botorch.models import SingleTaskGP >>> from botorch.models.transforms.input import Normalize >>> from botorch.models.transforms.outcome import Standardize >>> from gpytorch.mlls import ExactMarginalLogLikelihood >>> train_X = torch.rand(10, 1) >>> train_Y = torch.rand_like(train_X) >>> cv_folds = gen_loo_cv_folds(train_X, train_Y) >>> input_transform = Normalize(d=train_X.shape[-1]) >>> >>> cv_results = batch_cross_validation( ... model_cls=SingleTaskGP, ... mll_cls=ExactMarginalLogLikelihood, ... cv_folds=cv_folds, ... model_init_kwargs={ ... "input_transform": input_transform, ... }, ... ) """ if issubclass(model_cls, MultiTaskGP): raise UnsupportedError( "Multi-task GPs are not currently supported by `batch_cross_validation`." ) model_init_kws = model_init_kwargs if model_init_kwargs is not None else {} if cv_folds.train_Yvar is not None: model_init_kws["train_Yvar"] = cv_folds.train_Yvar model_cv = model_cls( train_X=cv_folds.train_X, train_Y=cv_folds.train_Y, **model_init_kws, ) mll_cv = mll_cls(model_cv.likelihood, model_cv) mll_cv.to(cv_folds.train_X) fit_args = fit_args or {} mll_cv = fit_gpytorch_mll(mll_cv, **fit_args) # Evaluate on the hold-out set in batch mode with torch.no_grad(): posterior = model_cv.posterior( cv_folds.test_X, observation_noise=observation_noise ) return CVResults( model=model_cv, posterior=posterior, observed_Y=cv_folds.test_Y, observed_Yvar=cv_folds.test_Yvar, )
[docs] def loo_cv( model: GPyTorchModel, observation_noise: bool = True, untransform: bool = True, ) -> CVResults: r"""Compute efficient Leave-One-Out cross-validation for a GP model. This is a high-level convenience function that automatically dispatches to the appropriate LOO CV implementation based on the model type: - For ensemble models (``_is_ensemble=True``): Uses ``ensemble_loo_cv`` which returns a ``GaussianMixturePosterior`` with both per-member and mixture statistics. - For standard GP models: Uses ``efficient_loo_cv`` which returns a ``GPyTorchPosterior`` with the LOO predictive distributions. Both implementations use efficient O(n³) matrix algebra rather than the naive O(n⁴) approach of refitting models for each fold. NOTE: This function does not refit the model to each LOO fold. The model hyperparameters are kept fixed, providing a fast approximation to full LOO CV. For models where hyperparameter changes are significant, consider using ``batch_cross_validation`` instead. NOTE: The ``untransform`` parameter defaults to True, which means results are returned in the original outcome space. Callers that previously relied on results in the model's internal (transformed) space should pass ``untransform=False`` explicitly. Args: model: A fitted GPyTorchModel. The model type determines which LOO CV implementation is used. observation_noise: If True (default), return the posterior predictive variance (including observation noise). If False, return the posterior variance of the latent function (excluding observation noise). The posterior variance is computed by subtracting the observation noise from the posterior predictive variance. untransform: If True (default), untransform the LOO predictions and observed values back to the original outcome space when the model has an outcome transform (e.g., ``Standardize``). This makes the results consistent with ``model.posterior()`` and ``batch_cross_validation``. If False, return results in the model's internal (transformed) space. Returns: CVResults: A named tuple containing: - model: The fitted GP model. - posterior: The LOO predictive distributions. For ensemble models, this is a ``GaussianMixturePosterior``; otherwise, it's a ``GPyTorchPosterior``. - observed_Y: The observed Y values. - observed_Yvar: The observed noise variances (if applicable). Example: >>> import torch >>> from botorch.cross_validation import loo_cv >>> from botorch.models import SingleTaskGP >>> from botorch.fit import fit_gpytorch_mll >>> from gpytorch.mlls import ExactMarginalLogLikelihood >>> >>> train_X = torch.rand(20, 2, dtype=torch.float64) >>> train_Y = torch.sin(train_X).sum(dim=-1, keepdim=True) >>> model = SingleTaskGP(train_X, train_Y) >>> mll = ExactMarginalLogLikelihood(model.likelihood, model) >>> fit_gpytorch_mll(mll) >>> loo_results = loo_cv(model) >>> loo_results.posterior.mean.shape torch.Size([20, 1, 1]) See Also: - ``efficient_loo_cv``: Direct access to the standard GP implementation. - ``ensemble_loo_cv``: Direct access to the ensemble model implementation. - ``batch_cross_validation``: Full LOO CV with model refitting. """ loo_fun = ( ensemble_loo_cv if getattr(model, "_is_ensemble", False) else efficient_loo_cv ) return loo_fun(model, observation_noise=observation_noise, untransform=untransform)
[docs] def efficient_loo_cv( model: GPyTorchModel, observation_noise: bool = True, untransform: bool = True, ) -> CVResults: r"""Compute efficient Leave-One-Out cross-validation for a GP model. NOTE: This function does not refit the model to each LOO fold, in contrast to batch_cross_validation. This is a memory- and compute-efficient way to compute LOO, but it does not account for potential changes in the model parameters due to the removal of a single observation. This is typically ok in cases with a lot of data, but can result in substantial differences (typically over-estimating performance) in the low data regime. This function leverages a well-known linear algebraic identity to compute all LOO predictive distributions in O(n^3) time, compared to the naive approach which requires O(n^4) time (O(n^3) per fold for n folds). The efficient LOO formulas for GPs are: .. math:: \mu_{LOO,i} = y_i - \frac{[K^{-1}(y - \mu)]_i}{[K^{-1}]_{ii}} \sigma^2_{LOO,i} = \frac{1}{[K^{-1}]_{ii}} where K is the covariance matrix including observation noise. This gives the posterior predictive variance (including noise). To get the posterior variance (excluding noise), we subtract the observation noise: .. math:: \sigma^2_{posterior,i} = \sigma^2_{LOO,i} - \sigma^2_{noise} NOTE: This function assumes the model has already been fitted and that the model's ``forward`` method returns a ``MultivariateNormal`` distribution. Args: model: A fitted GPyTorchModel whose ``forward`` method returns a ``MultivariateNormal`` distribution. observation_noise: If True (default), return the posterior predictive variance (including observation noise). If False, return the posterior variance of the latent function (excluding observation noise). untransform: If True (default), untransform the LOO predictions and observed values back to the original outcome space when the model has an outcome transform (e.g., ``Standardize``). This makes the results consistent with ``model.posterior()`` and ``batch_cross_validation``. If False, return results in the model's internal (transformed) space. Returns: CVResults: A named tuple containing: - model: The fitted GP model. - posterior: The LOO predictive distributions (typically a ``GPyTorchPosterior``; with nonlinear outcome transforms like ``Log``, a ``TransformedPosterior``). The posterior mean and variance have shape ``n x 1 x m`` or ``batch_shape x n x 1 x m``, matching the structure of ``batch_cross_validation`` (n folds, 1 held-out point per fold, m outputs). The underlying distribution has diagonal covariance since LOO predictions at different held-out points are computed independently. - observed_Y: The observed Y values with shape ``n x 1 x m`` or ``batch_shape x n x 1 x m``. - observed_Yvar: The observed noise variances (if provided) with shape ``n x 1 x m`` or ``batch_shape x n x 1 x m``. Example: >>> import torch >>> from botorch.cross_validation import efficient_loo_cv >>> from botorch.models import SingleTaskGP >>> from botorch.fit import fit_gpytorch_mll >>> from gpytorch.mlls import ExactMarginalLogLikelihood >>> >>> train_X = torch.rand(20, 2, dtype=torch.float64) >>> train_Y = torch.sin(train_X).sum(dim=-1, keepdim=True) >>> model = SingleTaskGP(train_X, train_Y) >>> mll = ExactMarginalLogLikelihood(model.likelihood, model) >>> fit_gpytorch_mll(mll) >>> loo_results = efficient_loo_cv(model) >>> loo_results.posterior.mean.shape torch.Size([20, 1, 1]) """ # Compute raw LOO predictions loo_mean, loo_variance, train_Y = _compute_loo_predictions( model, observation_noise=observation_noise ) # Get the number of outputs num_outputs = model.num_outputs # Build the posterior from raw LOO predictions posterior = _build_loo_posterior( loo_mean=loo_mean, loo_variance=loo_variance, num_outputs=num_outputs ) # Reshape observed data to LOO CV output format: n x 1 x m observed_Y = _reshape_to_loo_cv_format(train_Y, num_outputs) # Get observed Yvar if available (for fixed noise models) observed_Yvar = None if isinstance(model.likelihood, FixedNoiseGaussianLikelihood): observed_Yvar = _reshape_to_loo_cv_format(model.likelihood.noise, num_outputs) # Untransform predictions and observed values back to original space if untransform and hasattr(model, "outcome_transform"): posterior, observed_Y, observed_Yvar = _untransform_loo_results( model=model, posterior=posterior, observed_Y=observed_Y, observed_Yvar=observed_Yvar, ) return CVResults( model=model, posterior=posterior, observed_Y=observed_Y, observed_Yvar=observed_Yvar, )
def _untransform_loo_results( model: GPyTorchModel, posterior: Posterior, observed_Y: Tensor, observed_Yvar: Tensor | None, ) -> tuple[Posterior, Tensor, Tensor | None]: r"""Untransform LOO CV results from model-internal space to original space. Applies the model's outcome transform to map the LOO posterior and observed values back to the original (untransformed) outcome space. This uses ``outcome_transform.untransform_posterior`` for the posterior and ``outcome_transform.untransform`` for the observed values. For linear transforms like ``Standardize``, the posterior is analytically rescaled and remains a ``GPyTorchPosterior``. For nonlinear transforms like ``Log``, the posterior is wrapped in a ``TransformedPosterior``. Args: model: The GP model with an ``outcome_transform`` attribute. posterior: The LOO posterior in the model's internal (transformed) space. Shape: ``n x 1 x m`` or ``batch_shape x n x 1 x m``. observed_Y: The observed Y values in transformed space with shape ``n x 1 x m`` or ``batch_shape x n x 1 x m``. observed_Yvar: The observed noise variances in transformed space (if applicable) with shape ``n x 1 x m`` or ``batch_shape x n x 1 x m``. Returns: A tuple of (posterior, observed_Y, observed_Yvar) in the original (untransformed) outcome space. Note: if the outcome transform has a ``batch_shape`` (e.g., ``Standardize(batch_shape=[num_models])``), broadcasting during untransform may add leading dimensions to ``observed_Y`` that the caller must align with the posterior layout. """ outcome_transform = model.outcome_transform # Validate observed_Y shape before any transforms. if observed_Y.shape[-2] != 1: raise ValueError( "Expected observed_Y to have size 1 at dim -2 (q dimension), " f"got shape {observed_Y.shape}." ) posterior = outcome_transform.untransform_posterior(posterior) # Untransform observed_Y (and observed_Yvar if present). # observed_Y has shape n x 1 x m; Standardize.untransform expects # batch_shape x n x m. We squeeze the q=1 dim, untransform, and restore it. observed_Y_squeezed = observed_Y.squeeze(-2) observed_Yvar_squeezed = ( observed_Yvar.squeeze(-2) if observed_Yvar is not None else None ) observed_Y_utf, observed_Yvar_utf = outcome_transform.untransform( observed_Y_squeezed, observed_Yvar_squeezed ) observed_Y = observed_Y_utf.unsqueeze(-2) observed_Yvar = ( observed_Yvar_utf.unsqueeze(-2) if observed_Yvar_utf is not None else None ) return posterior, observed_Y, observed_Yvar def _likelihood_requires_X(likelihood: object) -> bool: r"""Check if a likelihood requires training inputs for noise computation. Returns True for likelihoods like SparseOutlierGaussianLikelihood that need training inputs to determine per-point noise (e.g., outlier variances). Standard GaussianLikelihood does not need training inputs. """ return isinstance(likelihood, SparseOutlierGaussianLikelihood) def _subtract_observation_noise(model: GPyTorchModel, loo_variance: Tensor) -> Tensor: r"""Subtract observation noise from LOO variance to get posterior variance. The efficient LOO formula computes the posterior predictive variance, which includes observation noise. To get the posterior variance of the latent function (without noise), we subtract the observation noise variance. This implementation uses the likelihood's ``forward`` method to extract noise variances in a general way. The ``forward`` method takes function samples and returns a distribution where the variance represents the observation noise. .. math:: \sigma^2_{posterior,i} = \sigma^2_{LOO,i} - \sigma^2_{noise} Args: model: The GP model with a likelihood containing the noise variance. loo_variance: The LOO posterior predictive variance with shape ``... x n x 1``. Returns: The posterior variance (without noise) with the same shape. """ likelihood = model.likelihood # Use the likelihood's forward method to extract noise variances. # By passing zeros as function samples, the returned distribution's # variance gives us the observation noise at each point. noise_shape = loo_variance.shape[:-1] # ... x n zeros = torch.zeros( noise_shape, dtype=loo_variance.dtype, device=loo_variance.device ) # Only pass training inputs when the likelihood needs them (e.g., # SparseOutlierGaussianLikelihood for per-point outlier noise). # Standard GaussianLikelihood doesn't need inputs, and passing # pre-transform inputs can cause shape mismatches with models that # use dimension-changing input transforms (e.g., AppendFeatures). if _likelihood_requires_X(likelihood): train_inputs = getattr(model, "train_inputs", None) noise_dist = likelihood.forward( zeros, train_inputs[0] if train_inputs else None ) else: noise_dist = likelihood.forward(zeros) # Extract noise variance and reshape to match loo_variance noise = noise_dist.variance.unsqueeze(-1) # ... x n x 1 loo_variance = loo_variance - noise # Clamp to ensure non-negative variance return loo_variance.clamp(min=0.0) def _compute_loo_predictions( model: GPyTorchModel, observation_noise: bool = True, ) -> tuple[Tensor, Tensor, Tensor]: r"""Compute raw LOO predictions (means and variances) for a GP model. This is an internal helper that computes the leave-one-out predictive means and variances using efficient matrix algebra. The formulas are: .. math:: \mu_{LOO,i} = y_i - \frac{[K^{-1}(y - \mu)]_i}{[K^{-1}]_{ii}} \sigma^2_{LOO,i} = \frac{1}{[K^{-1}]_{ii}} where K is the covariance matrix including observation noise and μ is the prior mean. This gives the posterior predictive variance (including noise). To get the posterior variance (excluding noise), we subtract the observation noise variance. Args: model: A fitted GPyTorchModel in eval mode whose ``forward`` method returns a ``MultivariateNormal`` distribution. observation_noise: If True (default), return the posterior predictive variance (including observation noise). If False, return the posterior variance of the latent function (excluding observation noise). Returns: A tuple of (loo_mean, loo_variance, train_Y) where: - loo_mean: LOO predictive means with shape ``... x n x 1`` - loo_variance: LOO predictive variances with shape ``... x n x 1`` - train_Y: The training targets from the model Raises: UnsupportedError: If the model doesn't have required attributes or the forward method doesn't return a MultivariateNormal. """ # Get training data - model should have train_inputs attribute if not hasattr(model, "train_inputs") or model.train_inputs is None: raise UnsupportedError( "Model must have train_inputs attribute for efficient LOO CV." ) if not hasattr(model, "train_targets") or model.train_targets is None: raise UnsupportedError( "Model must have train_targets attribute for efficient LOO CV." ) train_X = model.train_inputs[0] # Shape: n x d or batch_shape x n x d # Check for models with auxiliary inputs (e.g., auxiliary experiment data) # In such models, train_inputs[0] is a tuple of tensors rather than a single tensor if isinstance(train_X, tuple): raise UnsupportedError( "Efficient LOO CV is not supported for models with auxiliary inputs. " "train_inputs[0] is a tuple of tensors, indicating auxiliary data." ) train_Y = model.train_targets # Shape: n or batch_shape x n (batched outputs) n = train_X.shape[-2] prior_dist = model.forward(train_X) # Check that we got a MultivariateNormal if not isinstance(prior_dist, MultivariateNormal): raise UnsupportedError( f"Model's forward method must return a MultivariateNormal, " f"got {type(prior_dist).__name__}." ) # Extract mean from the prior # Shape: n for single-output, or m x n for batched multi-output mean = prior_dist.mean # Add observation noise to the diagonal via the likelihood # The likelihood adds noise: K_noisy = K + sigma^2 * I # Only pass training inputs when the likelihood needs them (e.g., # SparseOutlierGaussianLikelihood for per-point outlier noise). # Standard GaussianLikelihood doesn't need inputs, and passing # pre-transform inputs can cause shape mismatches with models that # use dimension-changing input transforms (e.g., AppendFeatures). if _likelihood_requires_X(model.likelihood): noisy_mvn = model.likelihood(prior_dist, model.train_inputs[0]) else: noisy_mvn = model.likelihood(prior_dist) # Get the covariance matrix - use lazy representation for potential caching K_noisy = noisy_mvn.lazy_covariance_matrix.to_dense() # Compute Cholesky decomposition (adds jitter if needed) L = psd_safe_cholesky(K_noisy) # Compute K^{-1}(y - mean) via Cholesky solve # Shape: ... x n x 1 where ... is batch_shape (includes m for multi-output) residuals = (train_Y - mean).unsqueeze(-1) K_inv_residuals = torch.cholesky_solve(residuals, L) # Compute diagonal of K^{-1} # K_inv = L^{-T} @ L^{-1}, so K_inv_diag[i] = sum_j (L^{-1}[j,i])^2 identity = torch.eye(n, dtype=L.dtype, device=L.device) if L.dim() > 2: identity = identity.expand(*L.shape[:-2], n, n) L_inv = torch.linalg.solve_triangular(L, identity, upper=False) K_inv_diag = (L_inv**2).sum(dim=-2) # ... x n # Compute LOO predictions using the efficient formulas: # sigma2_loo_i = 1 / [K^{-1}]_{ii} # mu_loo_i = y_i - [K^{-1}(y - mean)]_i / [K^{-1}]_{ii} # K_inv_diag has shape ... x n, so after unsqueeze(-1) we get ... x n x 1 # (the last dim is 1 because each GP is single-output). loo_variance = (1.0 / K_inv_diag).unsqueeze(-1) # ... x n x 1 loo_mean = train_Y.unsqueeze(-1) - K_inv_residuals * loo_variance # If we want the posterior (noiseless) variance, subtract the noise if not observation_noise: loo_variance = _subtract_observation_noise(model, loo_variance) return loo_mean, loo_variance, train_Y def _build_loo_posterior( loo_mean: Tensor, loo_variance: Tensor, num_outputs: int, ) -> GPyTorchPosterior: r"""Build a GPyTorchPosterior from raw LOO predictions. Args: loo_mean: LOO means with shape ``... x m x n x 1`` (multi-output) or ``... x n x 1`` (single-output), where ``...`` is optional batch_shape. loo_variance: LOO variances with same shape as loo_mean. num_outputs: Number of outputs (m). 1 for single-output models. Returns: A GPyTorchPosterior with shape ``... x n x 1 x m``. """ # Reshape tensors to final shape: ... x n x 1 x m if num_outputs > 1: # Multi-output: ... x m x n x 1 -> ... x n x 1 x m # The m dimension is at position -3, move it to position -1 loo_mean = loo_mean.movedim(-3, -1) loo_variance = loo_variance.movedim(-3, -1) else: # Single-output: ... x n x 1 -> ... x n x 1 x 1 loo_mean = loo_mean.unsqueeze(-1) loo_variance = loo_variance.unsqueeze(-1) # Create distribution: for multi-output use MTMVN, for single-output use MVN. # Both require mean shape ... x n x q (where q=1) and diagonal covariance. # We squeeze the m dimension to get ... x n x 1 for the MVN mean, then # iterate over outputs to create independent MVNs. mvns = [ MultivariateNormal( mean=loo_mean[..., t], covariance_matrix=DiagLinearOperator(loo_variance[..., t]), ) for t in range(num_outputs) ] if num_outputs > 1: mvn = MultitaskMultivariateNormal.from_independent_mvns(mvns=mvns) else: mvn = mvns[0] return GPyTorchPosterior(distribution=mvn) def _reshape_to_loo_cv_format(tensor: Tensor, num_outputs: int) -> Tensor: r"""Reshape a tensor to the standard LOO CV output format: ``n x 1 x m``. This helper converts tensors with internal model format (which varies by number of outputs) to the consistent output format used by LOO CV results. Args: tensor: Input tensor with shape: - Single-output: ``n`` (1D) - Multi-output: ``m x n`` (2D) num_outputs: Number of outputs (m). 1 for single-output models. Returns: Reshaped tensor with shape ``n x 1 x m``. """ if num_outputs > 1: # Multi-output: m x n -> n x m -> n x 1 x m return tensor.movedim(-2, -1).unsqueeze(-2) else: # Single-output: n -> n x 1 -> n x 1 x 1 return tensor.unsqueeze(-1).unsqueeze(-1)
[docs] def ensemble_loo_cv( model: GPyTorchModel, observation_noise: bool = True, untransform: bool = True, ) -> CVResults: r"""Compute efficient LOO cross-validation for ensemble models. This function computes Leave-One-Out cross-validation for ensemble models like ``SaasFullyBayesianSingleTaskGP``. For these models, the ``forward`` method returns a ``MultivariateNormal`` with a batch dimension containing statistics for all models in the ensemble. The LOO predictions from each ensemble member form a Gaussian mixture. This function returns a ``CVResults`` with a ``GaussianMixturePosterior`` that provides both per-member statistics (via ``posterior.mean`` and ``posterior.variance``) and aggregated mixture statistics (via ``posterior.mixture_mean`` and ``posterior.mixture_variance``). The mixture statistics are computed using the law of total variance: .. math:: \mu_{mix} = \frac{1}{K} \sum_{k=1}^{K} \mu_k \sigma^2_{mix} = \frac{1}{K} \sum_{k=1}^{K} \sigma^2_k + \frac{1}{K} \sum_{k=1}^{K} \mu_k^2 - \mu_{mix}^2 where K is the number of ensemble members. NOTE: This function assumes the model has already been fitted (e.g., using ``fit_fully_bayesian_model_nuts``) and that the model is an ensemble model with ``_is_ensemble = True``. Args: model: An ensemble GPyTorchModel (e.g., SaasFullyBayesianSingleTaskGP) whose ``forward`` method returns a ``MultivariateNormal`` distribution with a batch dimension for ensemble members. observation_noise: If True (default), return the posterior predictive variance (including observation noise). If False, return the posterior variance of the latent function (excluding observation noise). untransform: If True (default), untransform the LOO predictions and observed values back to the original outcome space when the model has an outcome transform (e.g., ``Standardize``). This makes the results consistent with ``model.posterior()`` and ``batch_cross_validation``. If False, return results in the model's internal (transformed) space. Returns: CVResults: A named tuple containing: - model: The fitted ensemble GP model. - posterior: A ``GaussianMixturePosterior`` with per-member shape ``n x num_models x 1 x m``. Access per-member statistics via ``posterior.mean`` and ``posterior.variance``, and mixture statistics via ``posterior.mixture_mean`` and ``posterior.mixture_variance``. - observed_Y: The observed Y values with shape ``n x num_models x 1 x m``, matching the posterior layout so that element-wise operations (e.g., ``posterior.mean - observed_Y``) work correctly. - observed_Yvar: The observed noise variances (if provided) with the same shape as ``observed_Y``. Example: >>> import torch >>> from botorch.cross_validation import ensemble_loo_cv >>> from botorch.models.fully_bayesian import SaasFullyBayesianSingleTaskGP >>> from botorch.models.fully_bayesian import fit_fully_bayesian_model_nuts >>> >>> train_X = torch.rand(20, 2, dtype=torch.float64) >>> train_Y = torch.sin(train_X).sum(dim=-1, keepdim=True) >>> model = SaasFullyBayesianSingleTaskGP(train_X, train_Y) >>> fit_fully_bayesian_model_nuts(model, warmup_steps=64, num_samples=32) >>> loo_results = ensemble_loo_cv(model) >>> loo_results.posterior.mean.shape # Per-member means torch.Size([20, 32, 1, 1]) >>> loo_results.posterior.mixture_mean.shape # Aggregated mixture mean torch.Size([20, 1, 1]) """ # Check that this is an ensemble model if not getattr(model, "_is_ensemble", False): raise UnsupportedError( "ensemble_loo_cv requires an ensemble model (with _is_ensemble=True). " f"Got model of type {type(model).__name__}. " "For non-ensemble models, use efficient_loo_cv instead." ) # Compute raw LOO predictions # For ensemble models, shapes are: num_models x n x 1 loo_mean, loo_variance, train_Y = _compute_loo_predictions( model, observation_noise=observation_noise ) # Validate that we have the expected batch dimension for ensemble if loo_mean.dim() < 3: raise UnsupportedError( "Expected ensemble model to produce batched LOO results with shape " f"(batch_shape x num_models x n x 1), but got shape {loo_mean.shape}." ) # Get the number of outputs num_outputs = model.num_outputs # Build the GaussianMixturePosterior posterior = _build_ensemble_loo_posterior( loo_mean=loo_mean, loo_variance=loo_variance, num_outputs=num_outputs ) # Extract observed data (first ensemble member) and reshape to LOO CV format observed_Y, observed_Yvar = _get_ensemble_observed_data( model=model, train_Y=train_Y, num_outputs=num_outputs ) # Untransform predictions and observed values back to original space if untransform and hasattr(model, "outcome_transform"): observed_Y_ndim = observed_Y.dim() posterior, observed_Y, observed_Yvar = _untransform_loo_results( model=model, posterior=posterior, observed_Y=observed_Y, observed_Yvar=observed_Yvar, ) # After untransform with a batch-shaped outcome transform (e.g., # Standardize(batch_shape=[num_models])), observed_Y gains leading # batch dimensions from the transform. For example, with # batch_shape=[num_models], observed_Y goes from [n, 1, m] to # [num_models, n, 1, m]. The posterior has num_models at MCMC_DIM=-3, # giving shape [n, num_models, 1, m]. Align observed_Y by moving all # added leading dims to just before the q=1 dim (MCMC_DIM position). n_added = observed_Y.dim() - observed_Y_ndim if n_added > 1: raise UnsupportedError( "ensemble_loo_cv does not support outcome transforms that add " f"more than 1 batch dimension. Got {n_added} added dimensions " f"(shape went from {observed_Y_ndim}D to {observed_Y.dim()}D)." ) if n_added == 1: observed_Y = observed_Y.movedim(0, -3) if observed_Yvar is not None: observed_Yvar = observed_Yvar.movedim(0, -3) # Ensure observed_Y has the num_models dimension at MCMC_DIM=-3 to match # the posterior shape [n, num_models, 1, m]. Without this, element-wise # operations like `posterior.mean - observed_Y` would fail due to # incompatible broadcasting (3D [n, 1, m] vs 4D [n, num_models, 1, m]). posterior_mean = posterior.mean if observed_Y.dim() < posterior_mean.dim(): observed_Y = observed_Y.unsqueeze(-3).expand_as(posterior_mean) if observed_Yvar is not None: observed_Yvar = observed_Yvar.unsqueeze(-3).expand_as(posterior_mean) return CVResults( model=model, posterior=posterior, observed_Y=observed_Y, observed_Yvar=observed_Yvar, )
def _build_ensemble_loo_posterior( loo_mean: Tensor, loo_variance: Tensor, num_outputs: int, ) -> GaussianMixturePosterior: r"""Build a GaussianMixturePosterior from raw ensemble LOO predictions. This function takes raw LOO means and variances from an ensemble model (computed by ``_compute_loo_predictions``) and packages them into a GaussianMixturePosterior that provides both per-member and mixture statistics. Args: loo_mean: LOO means with shape ``batch_shape x num_models x n x 1`` (single-output) or ``batch_shape x num_models x m x n x 1`` (multi-output). loo_variance: LOO variances with same shape as loo_mean. num_outputs: Number of outputs (m). 1 for single-output models. Returns: GaussianMixturePosterior with shape ``batch_shape x n x num_models x 1 x m``. The num_models dimension is at MCMC_DIM=-3. """ # Normalize shapes: add m=1 dimension for single-output to match multi-output if num_outputs == 1: # Single-output: ... x num_models x n x 1 -> ... x num_models x 1 x n x 1 loo_mean = loo_mean.unsqueeze(-3) loo_variance = loo_variance.unsqueeze(-3) # Now both cases have shape: ... x num_models x m x n x 1 # Transform to target shape: ... x n x num_models x 1 x m # 1. squeeze(-1): ... x num_models x m x n # 2. movedim(-1, -3): ... x n x num_models x m (move n before num_models) # 3. unsqueeze(-2): ... x n x num_models x 1 x m loo_mean = loo_mean.squeeze(-1).movedim(-1, -3).unsqueeze(-2) loo_variance = loo_variance.squeeze(-1).movedim(-1, -3).unsqueeze(-2) # Create distribution: iterate over outputs to create independent MVNs # After indexing with [..., t], shape is: batch_shape x n x num_models x 1 mvns = [ MultivariateNormal( mean=loo_mean[..., t], covariance_matrix=DiagLinearOperator(loo_variance[..., t]), ) for t in range(num_outputs) ] if num_outputs > 1: mvn = MultitaskMultivariateNormal.from_independent_mvns(mvns=mvns) else: mvn = mvns[0] return GaussianMixturePosterior(distribution=mvn) def _get_ensemble_observed_data( model: GPyTorchModel, train_Y: Tensor, num_outputs: int, ) -> tuple[Tensor, Tensor | None]: r"""Extract observed data from an ensemble model for LOO CV. Extracts the first ensemble member's training targets and observation noise, verifies all members share the same data, and reshapes to LOO CV format. Args: model: The ensemble GP model. train_Y: Training targets with shape ``... x num_models x n`` (single-output) or ``... x num_models x m x n`` (multi-output). num_outputs: Number of outputs (m). Returns: (observed_Y, observed_Yvar) with shape ``... x n x 1 x m``. Raises: UnsupportedError: If ensemble members have different training data. """ # num_models is at dim -2 for single-output, -3 for multi-output num_models_dim = -2 if num_outputs == 1 else -3 # Verify all ensemble members share the same training data _verify_ensemble_data_consistency(train_Y, num_models_dim, "train_Y") # Extract first ensemble member's data (they're all the same) train_Y_first = train_Y.select(num_models_dim, 0) observed_Y = _reshape_to_loo_cv_format(train_Y_first, num_outputs) # Get observed Yvar if available (for fixed noise models) observed_Yvar = None if isinstance(model.likelihood, FixedNoiseGaussianLikelihood): noise = model.likelihood.noise # Noise has the same shape structure as train_Y # Verify consistency and extract first member if noise.dim() > 1: _verify_ensemble_data_consistency( noise, num_models_dim, "observation noise" ) noise = noise.select(num_models_dim, 0) observed_Yvar = _reshape_to_loo_cv_format(noise, num_outputs) return observed_Y, observed_Yvar def _verify_ensemble_data_consistency( tensor: Tensor, num_models_dim: int, tensor_name: str, ) -> None: r"""Verify all ensemble members have identical data along ``num_models_dim``. Args: tensor: Data tensor with a num_models dimension. num_models_dim: Dimension index for num_models (typically -2 or -3). tensor_name: Name for error messages (e.g., "train_Y"). Raises: UnsupportedError: If data differs across ensemble members. """ num_models = tensor.shape[num_models_dim] if num_models <= 1: return first_member = tensor.select(num_models_dim, 0) first_expanded = first_member.unsqueeze(num_models_dim).expand_as(tensor) if not torch.equal(tensor, first_expanded): raise UnsupportedError( f"Ensemble members have different {tensor_name}. " "ensemble_loo_cv only supports ensembles where all members share the " "same training data (e.g., fully Bayesian models with MCMC samples). " "For ensembles with different data per member, cross-validate each " "member individually using efficient_loo_cv." )