Source code for torchsparsegradutils.utils.minres

# MIT-licensed code imported from https://github.com/cornellius-gp/linear_operator
# Minor modifications for torchsparsegradutils to remove dependencies

from typing import Callable, NamedTuple, Optional, Union

import torch



[docs]
class MINRESSettings(NamedTuple):
    max_cg_iterations: int = 1000  # The maximum number of conjugate gradient iterations to perform (when computing
    # matrix solves). A higher value rarely results in more accurate solves -- instead, lower the CG tolerance.
    minres_tolerance: float = 1e-4  # Relative update term tolerance to use for terminating MINRES.
    verbose_linalg: bool = False  # Print out information whenever running an expensive linear algebra routine



def _pad_with_singletons(obj, num_singletons_before=0, num_singletons_after=0):
    """
    Pad obj with singleton dimensions on the left and right
    Example:
        >>> x = torch.randn(10, 5)
        >>> _pad_with_singletons(x, 2, 3).shape
        torch.Size([1, 1, 10, 5, 1, 1, 1])
    """
    new_shape = [1] * num_singletons_before + list(obj.shape) + [1] * num_singletons_after
    return obj.view(*new_shape)



[docs]
def minres(
    matmul_closure: Union[torch.Tensor, Callable[[torch.Tensor], torch.Tensor]],
    rhs: torch.Tensor,
    eps: float = 1e-25,
    shifts: Optional[torch.Tensor] = None,
    value: Optional[float] = None,
    max_iter: Optional[int] = None,
    preconditioner: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
    settings: MINRESSettings = MINRESSettings(),
) -> torch.Tensor:
    """
    Minimum Residual (MINRES) solver for symmetric (Hermitian) linear systems.

    Solves linear systems ``A x = b`` where ``A`` is symmetric (Hermitian) and may
    be indefinite. Supports single/multiple right-hand sides and (optionally)
    multiple shift values to solve ``(A + \\sigma I) x = b`` in one run. Gradually
    minimizes the residual norm ``||A x - b||_2`` via the Lanczos process.

    Parameters
    ----------
    matmul_closure : {torch.Tensor, callable(x) -> A @ x}
        Matrix–vector multiplication operator. If a tensor is provided, its
        ``.matmul`` is used. The operator should represent a symmetric/Hermitian
        matrix for MINRES to behave as intended.
    rhs : torch.Tensor, shape (..., n) or (..., n, k)
        Right-hand side vector(s). Leading batch dimensions are supported; for
        multi-RHS, the last two dims are ``(n, k)``.
    eps : float, optional
        Small constant to prevent division by zero/numerical issues. Default: 1e-25.
    shifts : torch.Tensor or scalar, optional
        Shift(s) ``\\sigma`` for solving ``(A + \\sigma I) x = b``. If ``None`` or a
        scalar, a single system is solved. If a tensor with ``s`` elements, the
        solver computes ``s`` shifted systems and stacks their solutions along a
        new leading dimension.
    value : float, optional
        Scalar multiplier ``\\alpha`` applied to the operator (solves ``(\\alpha A) x = b``)
        when provided. Default: ``None`` (no scaling).
    max_iter : int, optional
        Maximum iterations. If ``None``, uses ``settings.max_cg_iterations``.
        Internally capped at ``n + 1`` where ``n`` is the problem size.
    preconditioner : callable, optional
        Left preconditioner with signature ``preconditioner(x) -> M^{-1} x``.
        If ``None``, no preconditioning is used.
    settings : MINRESSettings, optional
        Configuration object controlling iteration caps and tolerances
        (e.g., ``minres_tolerance`` for the relative update criterion).

    Returns
    -------
    torch.Tensor
        If ``shifts`` is ``None`` or a scalar: solution with the **same shape as**
        ``rhs`` (i.e., ``(..., n)`` or ``(..., n, k)``).
        If ``shifts`` has length ``s``: a stacked tensor of shape
        ``(s, *rhs.shape)`` containing solutions for each shift.

    Raises
    ------
    RuntimeError
        If ``matmul_closure`` is neither a tensor nor a callable.

    Notes
    -----
    - MINRES [1g]_ is appropriate for symmetric/Hermitian **indefinite** systems; it
      minimizes the Euclidean residual norm rather than the A-norm (as in CG).
    - For symmetric positive definite systems, Conjugate Gradient (CG) typically
      converges faster; prefer CG unless indefiniteness/robustness suggests MINRES.
    - When multiple shifts are provided, the solver reuses Lanczos information and
      returns one solution per shift value.
    - All inputs should share device and dtype; the implementation normalizes
      ``rhs`` internally and rescales the final solution(s).

    See Also
    --------
    linear_cg : Conjugate Gradient for SPD systems.
    bicgstab : BiCGSTAB for general non-symmetric systems.

    References
    ----------
    .. [1g] Paige, C. C., & Saunders, M. A. (1975). Solution of sparse indefinite
           systems of linear equations. *SIAM Journal on Numerical Analysis*, 12(4), 617–629.

    Examples
    --------
    Basic solve (indefinite, symmetric):

    >>> A = torch.tensor([[2.0, 1.0], [1.0, -1.0]])
    >>> b = torch.tensor([1.0, 2.0])
    >>> x = minres(A.matmul, b)
    >>> x.shape
    torch.Size([2])

    Multiple right-hand sides:

    >>> B = torch.randn(2, 3)
    >>> X = minres(A.matmul, B)
    >>> X.shape
    torch.Size([2, 3])

    Shifted system (regularization):

    >>> x_shifted = minres(A.matmul, b, shifts=torch.tensor(0.1))

    Sparse operator via closure:

    >>> idx = torch.tensor([[0, 0, 1, 1], [0, 1, 0, 1]])
    >>> val = torch.tensor([2.0, 1.0, 1.0, -1.0])
    >>> A_sp = torch.sparse_coo_tensor(idx, val, (2, 2))
    >>> x = minres(lambda v: A_sp @ v, b)

    With a simple diagonal preconditioner:

    >>> M_diag = torch.abs(torch.diag(A)) + 0.1
    >>> precond = lambda x: x / M_diag.unsqueeze(-1)
    >>> x = minres(A.matmul, b, preconditioner=precond)

    Custom iteration cap/tolerance:

    >>> settings = MINRESSettings(max_cg_iterations=200, minres_tolerance=1e-5)
    >>> x = minres(A.matmul, b, settings=settings)
    """
    # Default values
    if torch.is_tensor(matmul_closure):
        matmul_closure = matmul_closure.matmul
    mm_ = matmul_closure
    if preconditioner is None:
        preconditioner = lambda x: x.clone()

    if shifts is None:
        shifts = torch.tensor(0.0, dtype=rhs.dtype, device=rhs.device)

    # Scale the rhs
    squeeze = False
    if rhs.dim() == 1:
        rhs = rhs.unsqueeze(-1)
        squeeze = True

    rhs_norm = rhs.norm(2, dim=-2, keepdim=True)
    rhs_is_zero = rhs_norm.lt(1e-10)
    rhs_norm = rhs_norm.masked_fill_(rhs_is_zero, 1)
    rhs = rhs.div(rhs_norm)

    # Use the right number of iterations
    if max_iter is None:
        max_iter = settings.max_cg_iterations
    max_iter = min(max_iter, rhs.size(-2) + 1)

    # Epsilon (to prevent nans)
    eps = torch.tensor(eps, dtype=rhs.dtype, device=rhs.device)

    # Create space for matmul product, solution
    prod = mm_(rhs)
    if value is not None:
        prod.mul_(value)

    # Resize shifts
    shifts = _pad_with_singletons(shifts, 0, prod.dim() - shifts.dim() + 1)
    solution = torch.zeros(shifts.shape[:1] + prod.shape, dtype=rhs.dtype, device=rhs.device)

    # Variables for Lanczos terms
    zvec_prev2 = torch.zeros_like(prod)
    zvec_prev1 = rhs.clone().expand_as(prod).contiguous()
    qvec_prev1 = preconditioner(zvec_prev1)
    alpha_curr = torch.empty(prod.shape[:-2] + (1, prod.size(-1)), dtype=rhs.dtype, device=rhs.device)
    alpha_shifted_curr = torch.empty(solution.shape[:-2] + (1, prod.size(-1)), dtype=rhs.dtype, device=rhs.device)
    beta_prev = (zvec_prev1 * qvec_prev1).sum(dim=-2, keepdim=True).sqrt_()
    beta_curr = torch.empty_like(beta_prev)
    tmpvec = torch.empty_like(qvec_prev1)

    # Divide by beta_prev
    zvec_prev1.div_(beta_prev)
    qvec_prev1.div_(beta_prev)

    # Variables for the QR rotation
    # 1) Components of the Givens rotations
    cos_prev2 = torch.ones(solution.shape[:-2] + (1, rhs.size(-1)), dtype=rhs.dtype, device=rhs.device)
    sin_prev2 = torch.zeros(solution.shape[:-2] + (1, rhs.size(-1)), dtype=rhs.dtype, device=rhs.device)
    cos_prev1 = torch.ones_like(cos_prev2)
    sin_prev1 = torch.zeros_like(sin_prev2)
    radius_curr = torch.empty_like(cos_prev1)
    cos_curr = torch.empty_like(cos_prev1)
    sin_curr = torch.empty_like(cos_prev1)
    # 2) Terms QR decomposition of T
    subsub_diag_term = torch.empty_like(alpha_shifted_curr)
    sub_diag_term = torch.empty_like(alpha_shifted_curr)
    diag_term = torch.empty_like(alpha_shifted_curr)

    # Variables for the solution updates
    # 1) The "search" vectors of the solution
    # Equivalent to the vectors of Q R^{-1}, where Q is the matrix of Lanczos vectors and
    # R is the QR factor of the tridiagonal Lanczos matrix.
    search_prev2 = torch.zeros_like(solution)
    search_prev1 = torch.zeros_like(solution)
    search_curr = torch.empty_like(search_prev1)
    search_update = torch.empty_like(search_prev1)
    # 2) The "scaling" terms of the search vectors
    # Equivalent to the terms of V^T Q^T rhs, where Q is the matrix of Lanczos vectors and
    # V is the QR orthonormal of the tridiagonal Lanczos matrix.
    scale_prev = beta_prev.repeat(shifts.size(0), *([1] * beta_prev.dim()))
    scale_curr = torch.empty_like(scale_prev)

    # Terms for checking for convergence
    solution_norm = torch.zeros(*solution.shape[:-2], solution.size(-1), dtype=solution.dtype, device=solution.device)
    search_update_norm = torch.zeros_like(solution_norm)

    # Maybe log
    if settings.verbose_linalg:
        # settings.verbose_linalg.logger.debug(
        print(
            f"Running MINRES on a {rhs.shape} RHS for {max_iter} iterations (tol={settings.minres_tolerance}). "
            f"Output: {solution.shape}."
        )

    # Perform iterations
    for i in range(max_iter + 2):
        # Perform matmul
        prod = mm_(qvec_prev1)
        if value is not None:
            prod.mul_(value)

        # Get next Lanczos terms
        # --> alpha_curr, beta_curr, qvec_curr
        torch.mul(prod, qvec_prev1, out=tmpvec)
        torch.sum(tmpvec, -2, keepdim=True, out=alpha_curr)

        zvec_curr = prod.addcmul_(alpha_curr, zvec_prev1, value=-1).addcmul_(beta_prev, zvec_prev2, value=-1)

        qvec_curr = preconditioner(zvec_curr)
        torch.mul(zvec_curr, qvec_curr, out=tmpvec)
        torch.sum(tmpvec, -2, keepdim=True, out=beta_curr)
        beta_curr.sqrt_()
        beta_curr.clamp_min_(eps)

        zvec_curr.div_(beta_curr)
        qvec_curr.div_(beta_curr)

        # Perform JIT-ted update
        conv = _jit_minres_updates(
            solution,
            shifts,
            eps,
            qvec_prev1,
            alpha_curr,
            alpha_shifted_curr,
            beta_prev,
            beta_curr,
            cos_prev2,
            cos_prev1,
            cos_curr,
            sin_prev2,
            sin_prev1,
            sin_curr,
            radius_curr,
            subsub_diag_term,
            sub_diag_term,
            diag_term,
            search_prev2,
            search_prev1,
            search_curr,
            search_update,
            scale_prev,
            scale_curr,
            search_update_norm,
            solution_norm,
        )

        # Check convergence criterion
        if (i + 1) % 10 == 0:
            torch.norm(search_update, dim=-2, out=search_update_norm)
            torch.norm(solution, dim=-2, out=solution_norm)
            conv = search_update_norm.div_(solution_norm).mean().item()
            if conv < settings.minres_tolerance:
                break

        # Update terms for next iteration
        # Lanczos terms
        zvec_prev2, zvec_prev1 = zvec_prev1, prod
        qvec_prev1 = qvec_curr
        beta_prev, beta_curr = beta_curr, beta_prev
        # Givens rotations terms
        cos_prev2, cos_prev1, cos_curr = cos_prev1, cos_curr, cos_prev2
        sin_prev2, sin_prev1, sin_curr = sin_prev1, sin_curr, sin_prev2
        # Search vector terms)
        search_prev2, search_prev1, search_curr = search_prev1, search_curr, search_prev2
        scale_prev, scale_curr = scale_curr, scale_prev

    # For rhs-s that are close to zero, set them to zero
    solution.masked_fill_(rhs_is_zero, 0)

    if squeeze:
        solution = solution.squeeze(-1)
        rhs = rhs.squeeze(-1)
        rhs_norm = rhs_norm.squeeze(-1)

    if shifts.numel() == 1:
        # If we weren't shifting we shouldn't return a batch output
        solution = solution.squeeze(0)

    return solution.mul_(rhs_norm)



def _jit_minres_updates(
    solution,
    shifts,
    eps,
    qvec_prev1,
    alpha_curr,
    alpha_shifted_curr,
    beta_prev,
    beta_curr,
    cos_prev2,
    cos_prev1,
    cos_curr,
    sin_prev2,
    sin_prev1,
    sin_curr,
    radius_curr,
    subsub_diag_term,
    sub_diag_term,
    diag_term,
    search_prev2,
    search_prev1,
    search_curr,
    search_update,
    scale_prev,
    scale_curr,
    search_update_norm,
    solution_norm,
):
    # Start givens rotation
    # Givens rotation from 2 steps ago
    torch.mul(sin_prev2, beta_prev, out=subsub_diag_term)
    torch.mul(cos_prev2, beta_prev, out=sub_diag_term)

    # Compute shifted alpha
    torch.add(alpha_curr, shifts, out=alpha_shifted_curr)

    # Givens rotation from 1 step ago
    torch.mul(alpha_shifted_curr, cos_prev1, out=diag_term).addcmul_(sin_prev1, sub_diag_term, value=-1)
    sub_diag_term.mul_(cos_prev1).addcmul_(sin_prev1, alpha_shifted_curr)

    # 3) Compute next Givens terms
    torch.mul(diag_term, diag_term, out=radius_curr).addcmul_(beta_curr, beta_curr).sqrt_()
    cos_curr = torch.div(diag_term, radius_curr, out=cos_curr)
    sin_curr = torch.div(beta_curr, radius_curr, out=sin_curr)
    # 4) Apply current Givens rotation
    diag_term.mul_(cos_curr).addcmul_(sin_curr, beta_curr)

    # Update the solution
    # --> search_curr, scale_curr solution
    # 1) Apply the latest Givens rotation to the Lanczos-rhs ( ||rhs|| e_1 )
    # This is getting the scale terms for the "search" vectors
    torch.mul(scale_prev, sin_curr, out=scale_curr).mul_(-1)
    scale_prev.mul_(cos_curr)
    # 2) Get the new search vector
    torch.addcmul(qvec_prev1, sub_diag_term, search_prev1, value=-1, out=search_curr)
    search_curr.addcmul_(subsub_diag_term, search_prev2, value=-1)
    search_curr.div_(diag_term)

    # 3) Update the solution
    torch.mul(search_curr, scale_prev, out=search_update)
    solution.add_(search_update)