Source code for plenoptic.metric._perceptual_distance

"""
Metrics designed to model human perceptual distance.

Metrics that model human perceptual distance seek to answer the question "how different
do humans find these two images?".
"""  # numpydoc ignore=EX01

import warnings
from typing import Literal

import torch
import torch.nn.functional as F

from ..process import normalized_laplacian_pyramid
from ..process.metric import _ssim_parts

__all__ = [
    "ms_ssim",
    "nlpd",
    "ssim",
]


def __dir__() -> list[str]:
    return __all__



[docs]
def ssim(
    img1: torch.Tensor,
    img2: torch.Tensor,
    weighted: bool = False,
    pad: Literal[False, "constant", "reflect", "replicate", "circular"] = False,
) -> torch.Tensor:
    r"""
    Compute the structural similarity index.

    As described in Wang et al., 2004 [1]_, the structural similarity index (SSIM) is a
    perceptual distance metric, giving the distance between two images. SSIM is based on
    three comparison measurements between the two images: luminance, contrast, and
    structure. All of these are computed convolutionally across the images. See the
    references for more information.

    This implementation follows the original implementation, as found online [2]_, as
    well as providing the option to use the weighted version used in Wang and
    Simoncelli, 2008 [4]_ (which was shown to consistently improve the image quality
    prediction on the LIVE database). More info can be found online [3]_.

    Note that this is a similarity metric (not a distance), and so 1 means the
    two images are identical and 0 means they're very different. When the two
    images are negatively correlated, SSIM can be negative. SSIM is bounded
    between -1 and 1.

    This function returns the mean SSIM, a scalar-valued metric giving the
    average over the whole image. For the SSIM map (showing the computed value
    across the image), call :func:`~plenoptic.process.ssim_map`.

    Parameters
    ----------
    img1
        The first image or batch of images, of shape (batch, channel, height, width).
    img2
        The second image or batch of images, of shape (batch, channel, height, width).
        The heights and widths of ``img1`` and ``img2`` must be the same. The numbers of
        batches and channels of ``img1`` and ``img2`` need to be broadcastable: either
        they are the same or one of them is 1. The output will be computed separately
        for each channel (so channels are treated in the same way as batches). Both
        images should have values between 0 and 1. Otherwise, the result may be
        inaccurate, and we will raise a warning (but will still compute it).
    weighted
        Whether to use the original, unweighted SSIM version (``False``) as used
        in [1]_ or the weighted version (``True``) as used in [4]_. See Notes
        section for the weight.
    pad :
        If not ``False``, how to pad the image for the convolutions computing the
        local average of each image. See :func:`torch.nn.functional.pad` for how
        these work.

    Returns
    -------
    mssim
        2d tensor of shape (batch, channel) containing the mean SSIM for each
        image, averaged over the whole image.

    Raises
    ------
    ValueError
        If either ``img1`` or ``img2`` is not 4d.
    ValueError
        If ``img1`` and ``img2`` have different height or width.
    ValueError
        If ``img1`` and ``img2`` have different batch or channel, unless one of them has
        a 1 there, so they can be broadcast.
    ValueError
        If ``img1`` and ``img2`` have different dtypes.

    Warns
    -----
    UserWarning
        If either ``img1`` or ``img2`` has multiple channels, as SSIM was designed for
        grayscale images.
    UserWarning
        If at least one scale from either ``img1`` or ``img2`` has height or width of
        less than 11, since SSIM uses an 11x11 convolutional kernel.

    Notes
    -----
    The weight used when ``weighted=True`` is:

    .. math::
       \log((1+\frac{\sigma_1^2}{C_2})(1+\frac{\sigma_2^2}{C_2}))

    where :math:`\sigma_1^2` and :math:`\sigma_2^2` are the variances of ``img1``
    and ``img2``, respectively, and :math:`C_2` is a constant. See [4]_ for more
    details.

    References
    ----------
    .. [1] Z. Wang, A. C. Bovik, H. R. Sheikh, and E. P. Simoncelli, "Image
       quality assessment: From error measurement to structural similarity"
       IEEE Transactions on Image Processing, vol. 13, no. 1, Jan. 2004.
    .. [2] matlab code: `<https://www.cns.nyu.edu/~lcv/ssim/ssim_index.m>`_
    .. [3] project page: `<https://www.cns.nyu.edu/~lcv/ssim/>`_
    .. [4] Wang, Z., & Simoncelli, E. P. (2008). Maximum differentiation (MAD)
       competition: A methodology for comparing computational models of
       perceptual discriminability. Journal of Vision, 8(12), 1–13.
       https://dx.doi.org/10.1167/8.12.8

    Examples
    --------
    >>> import plenoptic as po
    >>> import torch
    >>> po.set_seed(0)
    >>> img = po.data.einstein()
    >>> po.metric.ssim(img, img + torch.rand_like(img))
    tensor([[0.0519]])
    """
    # these are named map_ssim instead of the perhaps more natural ssim_map
    # because that's the name of a function
    map_ssim, _, weight = _ssim_parts(img1, img2, pad)
    if not weighted:
        mssim = map_ssim.mean((-1, -2))
    else:
        mssim = (map_ssim * weight).sum((-1, -2)) / weight.sum((-1, -2))

    if min(img1.shape[2], img1.shape[3]) < 11:
        warnings.warn(
            "SSIM uses 11x11 convolutional kernel, but the height and/or "
            "the width of the input image is smaller than 11, so the "
            "kernel size is set to be the minimum of these two numbers."
        )
    return mssim




[docs]
def ms_ssim(
    img1: torch.Tensor, img2: torch.Tensor, power_factors: torch.Tensor | None = None
) -> torch.Tensor:
    r"""
    Multiscale structural similarity index (MS-SSIM).

    As described in Wang et al., 2003 [9]_, multiscale structural similarity index
    (MS-SSIM) is an improvement upon structural similarity index (SSIM) that takes into
    account the perceptual distance between two images on different scales.

    SSIM is based on three comparison measurements between the two images:
    luminance, contrast, and structure. All of these are computed convolutionally
    across the images, producing three maps instead of scalars. The SSIM map is
    the elementwise product of these three maps. See :func:`ssim` and
    :func:`~plenoptic.process.ssim_map` for a full description of SSIM.

    To get images of different scales, average pooling operations with kernel
    size 2 are performed recursively on the input images. The product of
    contrast map and structure map (the "contrast-structure map") is computed
    for all but the coarsest scales, and the overall SSIM map is only computed
    for the coarsest scale. Their mean values are raised to exponents and
    multiplied to produce MS-SSIM:

    .. math::
        MSSSIM = {SSIM}_M^{a_M} \prod_{i=1}^{M-1} ({CS}_i)^{a_i}

    Here :math:`M` is the number of scales, :math:`{CS}_i` is the mean value
    of the contrast-structure map for the i'th finest scale, and :math:`{SSIM}_M`
    is the mean value of the SSIM map for the coarsest scale. If at least one
    of these terms are negative, the value of MS-SSIM is zero. The values of
    :math:`a_i, i=1,...,M` are taken from the argument ``power_factors``.

    Parameters
    ----------
    img1
        The first image or batch of images, of shape (batch, channel, height, width).
    img2
        The second image or batch of images, of shape (batch, channel, height, width).
        The heights and widths of ``img1`` and ``img2`` must be the same. The numbers of
        batches and channels of ``img1`` and ``img2`` need to be broadcastable: either
        they are the same or one of them is 1. The output will be computed separately
        for each channel (so channels are treated in the same way as batches). Both
        images should have values between 0 and 1. Otherwise, the result may be
        inaccurate, and we will raise a warning (but will still compute it).
    power_factors
        Power exponents for the mean values of maps, for different scales (from
        fine to coarse). The length of this array determines the number of scales.
        If ``None``, set to ``[0.0448, 0.2856, 0.3001, 0.2363, 0.1333]``, which is what
        psychophysical experiments in Wang et al., 2003 [9]_ found.

    Returns
    -------
    msssim
        2d tensor of shape (batch, channel) containing the MS-SSIM for each image.

    Raises
    ------
    ValueError
        If either ``img1`` or ``img2`` is not 4d.
    ValueError
        If ``img1`` and ``img2`` have different height or width.
    ValueError
        If ``img1`` and ``img2`` have different batch or channel, unless one of them has
        a 1 there, so they can be broadcast.
    ValueError
        If ``img1`` and ``img2`` have different dtypes.

    Warns
    -----
    UserWarning
        If either ``img1`` or ``img2`` has multiple channels, as MS-SSIM was designed
        for grayscale images.
    UserWarning
        If at least one scale from either ``img1`` or ``img2`` has height or width of
        less than 11, since SSIM uses an 11x11 convolutional kernel.

    References
    ----------
    .. [9] Wang, Zhou, Eero P. Simoncelli, and Alan C. Bovik. "Multiscale
       structural similarity for image quality assessment." The Thrity-Seventh
       Asilomar Conference on Signals, Systems & Computers, 2003. Vol. 2. IEEE, 2003.

    Examples
    --------
    >>> import plenoptic as po
    >>> import torch
    >>> po.set_seed(0)
    >>> img = po.data.einstein()
    >>> po.metric.ms_ssim(img, img + torch.rand_like(img))
    tensor([[0.4684]])
    """
    if power_factors is None:
        power_factors = [0.0448, 0.2856, 0.3001, 0.2363, 0.1333]

    def downsample(img: torch.Tensor) -> torch.Tensor:  # numpydoc ignore=GL08
        img = F.pad(img, (0, img.shape[3] % 2, 0, img.shape[2] % 2), mode="replicate")
        img = F.avg_pool2d(img, kernel_size=2)
        return img

    msssim = 1
    for i in range(len(power_factors) - 1):
        _, contrast_structure_map, _ = _ssim_parts(img1, img2, func_name="MS-SSIM")
        msssim *= F.relu(contrast_structure_map.mean((-1, -2))).pow(power_factors[i])
        img1 = downsample(img1)
        img2 = downsample(img2)
    map_ssim, _, _ = _ssim_parts(img1, img2, func_name="MS-SSIM")
    msssim *= F.relu(map_ssim.mean((-1, -2))).pow(power_factors[-1])

    if min(img1.shape[2], img1.shape[3]) < 11:
        warnings.warn(
            "SSIM uses 11x11 convolutional kernel, but for some scales "
            "of the input image, the height and/or the width is smaller "
            "than 11, so the kernel size in SSIM is set to be the "
            "minimum of these two numbers for these scales."
        )
    return msssim




[docs]
def nlpd(
    img1: torch.Tensor,
    img2: torch.Tensor,
    epsilon: float = 1e-10,
) -> torch.Tensor:
    """
    Compute the normalized Laplacian Pyramid Distance.

    As described in Laparra et al., 2016 [12]_, this is an image quality metric based on
    the transformations associated with the early visual system: local luminance
    subtraction and local contrast gain control.

    A laplacian pyramid subtracts a local estimate of the mean luminance at six scales.
    Then a local gain control divides these centered coefficients by a weighted sum of
    absolute values in spatial neighborhood.

    These weights parameters were optimized for redundancy reduction over an training
    database of (undistorted) natural images, as described in the paper. Parameters were
    copied from matlab code used for the paper, found online [13]_.

    Note that we compute root mean squared error for each scale, and then average over
    these, effectively giving larger weight to the lower frequency coefficients
    (which are fewer in number, due to subsampling).

    .. versionchanged:: 2.0.0
       Made ``epsilon`` parameter public (previously, was hard-coded to ``1e-10``).

    Parameters
    ----------
    img1
        The first image or batch of images of shape (batch, channel, height, width).
    img2
        The second image or batch of images of shape (batch, channel, height, width).
        The heights and widths of ``img1`` and ``img2`` must be the same. The numbers of
        batches and channels of ``img1`` and ``img2`` need to be broadcastable: either
        they are the same or one of them is 1. The output will be computed separately
        for each channel (so channels are treated in the same way as batches). Both
        images should have values between 0 and 1. Otherwise, the result may be
        inaccurate, and we will raise a warning (but will still compute it).
    epsilon
        Float added to the computation of the average distance on each band, helpful for
        stabilizing gradient around zero for optimization purposes. Note that the
        distance between two identical images will thus be ``sqrt(epsilon)`` (see
        Examples).

    Returns
    -------
    distance
        The normalized Laplacian Pyramid distance, with shape (batch, channel).

    Raises
    ------
    ValueError
        If either ``img1`` or ``img2`` is not 4d.
    ValueError
        If ``img1`` and ``img2`` have different height or width.
    ValueError
        If ``img1`` and ``img2`` have different batch or channel, unless one of them has
        a 1 there, so they can be broadcast.
    ValueError
        If ``img1`` and ``img2`` have different dtypes.

    Warns
    -----
    UserWarning
        If either ``img1`` or ``img2`` has multiple channels, as SSIM was designed for
        grayscale images.
    UserWarning
        If either ``img1`` or ``img2`` has a value outside of range ``[0, 1]``.

    References
    ----------
    .. [12] Laparra, V., Ballé, J., Berardino, A. and Simoncelli, E.P., 2016. Perceptual
        image quality assessment using a normalized Laplacian pyramid. Electronic
        Imaging, 2016(16), pp.1-6.
    .. [13] matlab code: `<https://www.cns.nyu.edu/~lcv/NLPyr/NLP_dist.m>`_

    Examples
    --------
    >>> import plenoptic as po
    >>> einstein_img = po.data.einstein()
    >>> curie_img = po.data.curie()
    >>> po.metric.nlpd(einstein_img, curie_img)
    tensor([[1.3507]])

    Distance between two identical images is ``sqrt(epsilon)`` (this is useful for
    stabilitizing gradient around zero during optimization):

    >>> po.metric.nlpd(einstein_img, einstein_img)
    tensor([[1.0000e-05]])
    >>> po.metric.nlpd(einstein_img, einstein_img, epsilon=1e-20)
    tensor([[1.0000e-10]])
    """
    if not img1.ndim == img2.ndim == 4:
        raise ValueError(
            "Input images should have four dimensions: (batch, channel, height, width)"
        )
    if img1.shape[-2:] != img2.shape[-2:]:
        raise ValueError("img1 and img2 must have the same height and width!")
    for i in range(2):
        if img1.shape[i] != img2.shape[i] and img1.shape[i] != 1 and img2.shape[i] != 1:
            raise ValueError(
                "Either img1 and img2 should have the same number of "
                "elements in the batch and channel dimensions, or one of "
                "them should be 1! But got shapes "
                f"{img1.shape}, {img2.shape} instead"
            )
    if img1.shape[1] > 1 or img2.shape[1] > 1:
        warnings.warn(
            "NLPD was designed for grayscale images and here it will be"
            " computed separately for each channel (so channels are treated in"
            " the same way as batches)."
        )

    img_ranges = torch.stack([img1.min(), img1.max(), img2.min(), img2.max()])
    if (img_ranges > 1).any() or (img_ranges < 0).any():
        warnings.warn(
            "Image range falls outside [0, 1]. NLPD output may not make sense."
        )

    y1 = normalized_laplacian_pyramid(img1)
    y2 = normalized_laplacian_pyramid(img2)

    dist = []
    for i in range(6):
        dist.append(torch.sqrt(torch.mean((y1[i] - y2[i]) ** 2, dim=(2, 3)) + epsilon))

    return torch.stack(dist).mean(dim=0)