Source code for pennylane.gradients.vjp

# Copyright 2018-2021 Xanadu Quantum Technologies Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains functions for computing the vector-Jacobian product
of tapes.
"""
import autograd

# pylint: disable=no-member, too-many-branches
import numpy as np

import pennylane as qml


def _convert(jac, dy_row):
    """Utility to convert and cast the jacobian as dy_row."""
    if isinstance(jac, tuple):
        jac_new = []
        for j in jac:
            j_ = qml.math.convert_like(j, dy_row)
            j_ = qml.math.cast_like(j_, dy_row)
            jac_new.append(j_)
        jac = tuple(jac_new)
    else:
        jac = qml.math.convert_like(jac, dy_row)
        jac = qml.math.cast_like(jac, dy_row)
    return jac


def _all_close_to_zero(dy):
    """
    Check if all entries of dy are close to 0. dy can also be a nested tuple
    structure of tensors, in which case this returns True iff all tensors are
    close to 0
    """
    if not isinstance(dy, (list, tuple)):
        return qml.math.allclose(dy, 0)

    # call this method recursively
    return all(_all_close_to_zero(dy_) for dy_ in dy)


[docs]def compute_vjp_single(dy, jac, num=None):
    """Convenience function to compute the vector-Jacobian product for a given
    vector of gradient outputs and a Jacobian for a single measurement tape.

    Args:
        dy (tensor_like): vector of gradient outputs
        jac (tensor_like, tuple): Jacobian matrix
        num (int): The length of the flattened ``dy`` argument. This is an
            optional argument, but can be useful to provide if ``dy`` potentially
            has no shape (for example, due to tracing or just-in-time compilation).

    Returns:
        tensor_like: the vector-Jacobian product

    **Examples**

    1. For a single parameter and a single measurement without shape (e.g. expval, var):

    .. code-block:: pycon

        >>> jac = np.array(0.1)
        >>> dy = np.array(2)
        >>> compute_vjp_single(dy, jac)
        array([0.2])

    2. For a single parameter and a single measurement with shape (e.g. probs):

    .. code-block:: pycon

        >>> jac = np.array([0.1, 0.2])
        >>> dy = np.array([1.0, 1.0])
        >>> compute_vjp_single(dy, jac)
        array([0.3])


    3. For multiple parameters (in this case 2 parameters) and a single measurement without shape (e.g. expval, var):

    .. code-block:: pycon

        >>> jac = tuple([np.array(0.1), np.array(0.2)])
        >>> dy = np.array(2)
        >>> compute_vjp_single(dy, jac)
        array([0.2, 0.4])

    4. For multiple parameters (in this case 2 parameters) and a single measurement with shape (e.g. probs):

    .. code-block:: pycon

        >>> jac = tuple([np.array([0.1, 0.2]), np.array([0.3, 0.4])])
        >>> dy = np.array([1.0, 2.0])
        >>> compute_vjp_single(dy, jac)
        array([0.5, 1.1])

    """
    if jac is None:
        return None

    dy_row = qml.math.reshape(dy, [-1])

    if num is None:
        num = qml.math.shape(dy_row)[0]

    if not isinstance(dy_row, np.ndarray):
        jac = _convert(jac, dy_row)

    # Note: For generality, all exception type warnings are disabled.
    # TODO: Excplictly catalogue and update raises for known types.

    # Single measurement with a single param
    if not isinstance(jac, (tuple, autograd.builtins.SequenceBox)):
        # No trainable parameters
        if jac.shape == (0,):
            res = qml.math.zeros((1, 0))
            return res
        # Single measurement with no dimension e.g. expval or with dimension e.g. probs
        if num == 1:
            jac = qml.math.squeeze(jac)
        jac = qml.math.reshape(jac, (-1, 1))
        try:
            res = dy_row @ jac

        except Exception:  # pylint: disable=broad-except
            res = qml.math.tensordot(jac, dy_row, [[0], [0]])

    # Single measurement with multiple params
    else:
        # No trainable parameters (adjoint)
        if len(jac) == 0:
            res = qml.math.zeros((1, 0))
            return res
        # Single measurement with no dimension e.g. expval
        if num == 1:
            jac = qml.math.reshape(qml.math.stack(jac), (1, -1))
            try:
                res = dy_row @ jac
            except Exception:  # pylint: disable=broad-except
                res = qml.math.tensordot(jac, dy_row, [[0], [0]])

        # Single measurement with dimension e.g. probs
        else:
            jac = qml.math.reshape(qml.math.stack(jac), (-1, num))
            try:
                res = jac @ dy_row
            except Exception:  # pylint: disable=broad-except
                res = qml.math.tensordot(jac, dy_row, [[1], [0]])

    return res


[docs]def compute_vjp_multi(dy, jac, num=None):
    """Convenience function to compute the vector-Jacobian product for a given
    vector of gradient outputs and a Jacobian for a tape with multiple measurements.

    Args:
        dy (tensor_like): vector of gradient outputs
        jac (tensor_like, tuple): Jacobian matrix
        num (int): The length of the flattened ``dy`` argument. This is an
            optional argument, but can be useful to provide if ``dy`` potentially
            has no shape (for example, due to tracing or just-in-time compilation).

    Returns:
        tensor_like: the vector-Jacobian product

    **Examples**

    1. For a single parameter and multiple measurement (one without shape and one with shape, e.g. expval and probs):

    .. code-block:: pycon

        >>> jac = tuple([np.array(0.1), np.array([0.3, 0.4])])
        >>> dy = tuple([np.array(1.0), np.array([1.0, 2.0])])
        >>> compute_vjp_multi(dy, jac)
        array([1.2])

    2. For multiple parameters (in this case 2 parameters) and multiple measurement (one without shape and one with
    shape, e.g. expval and probs):

    .. code-block:: pycon

        >>> jac = ((np.array(0.1), np.array(0.2)), (np.array([0.3, 0.4]), np.array([0.5, 0.6])))
        >>> dy = tuple([np.array(1.0), np.array([1.0, 2.0])])
        >>> compute_vjp_multi(dy, jac)
        array([1.2, 1.9])

    """
    if jac is None:
        return None

    # Single parameter
    if not isinstance(jac[0], (tuple, autograd.builtins.SequenceBox)):
        res = []
        for d, j_ in zip(dy, jac):
            res.append(compute_vjp_single(d, j_, num=num))
        res = qml.math.sum(qml.math.stack(res), axis=0)
    # Multiple parameters
    else:
        try:
            dy_interface = qml.math.get_interface(dy[0])
            # dy  -> (i,j)      observables, entries per observable
            # jac -> (i,k,j)    observables, parameters, entries per observable
            # Contractions over observables and entries per observable
            dy_shape = qml.math.shape(dy)
            if len(dy_shape) > 1:  # multiple values exist per observable output
                return qml.math.array(qml.math.einsum("ij,i...j", dy, jac), like=dy[0])

            if dy_interface == "tensorflow":
                # TF needs a different path for Hessian support
                return qml.math.array(
                    qml.math.einsum("i,i...", dy, jac, like=dy[0]), like=dy[0]
                )  # Scalar value per observable output
            return qml.math.array(
                qml.math.einsum("i,i...", dy, jac), like=dy[0]
            )  # Scalar value per observable output
        # NOTE: We want any potential failure to fall back here, so catch every exception type
        # TODO: Catalogue and update for expected exception types
        except Exception:  # pylint: disable=broad-except
            res = []
            for d, j_ in zip(dy, jac):
                sub_res = []
                for j in j_:
                    sub_res.append(qml.math.squeeze(compute_vjp_single(d, j, num=num)))
                res.append(sub_res)
            res = qml.math.stack([qml.math.stack(r) for r in res])
            res = qml.math.sum(res, axis=0)
    return res


[docs]def vjp(tape, dy, gradient_fn, gradient_kwargs=None):
    r"""Generate the gradient tapes and processing function required to compute
    the vector-Jacobian products of a tape.

    Consider a function :math:`\mathbf{f}(\mathbf{x})`. The Jacobian is given by

    .. math::

        \mathbf{J}_{\mathbf{f}}(\mathbf{x}) = \begin{pmatrix}
            \frac{\partial f_1}{\partial x_1} &\cdots &\frac{\partial f_1}{\partial x_n}\\
            \vdots &\ddots &\vdots\\
            \frac{\partial f_m}{\partial x_1} &\cdots &\frac{\partial f_m}{\partial x_n}\\
        \end{pmatrix}.

    During backpropagation, the chain rule is applied. For example, consider the
    cost function :math:`h = y\circ f: \mathbb{R}^n \rightarrow \mathbb{R}`,
    where :math:`y: \mathbb{R}^m \rightarrow \mathbb{R}`.
    The gradient is:

    .. math::

        \nabla h(\mathbf{x}) = \frac{\partial y}{\partial \mathbf{f}} \frac{\partial \mathbf{f}}{\partial \mathbf{x}}
        = \frac{\partial y}{\partial \mathbf{f}} \mathbf{J}_{\mathbf{f}}(\mathbf{x}).

    Denote :math:`d\mathbf{y} = \frac{\partial y}{\partial \mathbf{f}}`; we can write this in the form
    of a matrix multiplication:

    .. math:: \left[\nabla h(\mathbf{x})\right]_{j} = \sum_{i=0}^m d\mathbf{y}_i ~ \mathbf{J}_{ij}.

    Thus, we can see that the gradient of the cost function is given by the so-called
    **vector-Jacobian product**; the product of the row-vector :math:`d\mathbf{y}`, representing
    the gradient of subsequent components of the cost function, and :math:`\mathbf{J}`,
    the Jacobian of the current node of interest.

    Args:
        tape (.QuantumTape): quantum tape to differentiate
        dy (tensor_like): Gradient-output vector. Must have shape
            matching the output shape of the corresponding tape.
        gradient_fn (callable): the gradient transform to use to differentiate
            the tape
        gradient_kwargs (dict): dictionary of keyword arguments to pass when
            determining the gradients of tapes

    Returns:
        tensor_like or None: Vector-Jacobian product. Returns None if the tape
        has no trainable parameters.

    **Example**

    Consider the following quantum tape with PyTorch parameters:

    .. code-block:: python

        import torch

        x = torch.tensor([[0.1, 0.2, 0.3],
                          [0.4, 0.5, 0.6]], requires_grad=True, dtype=torch.float64)

        ops = [
            qml.RX(x[0, 0], wires=0),
            qml.RY(x[0, 1], wires=1),
            qml.RZ(x[0, 2], wires=0),
            qml.CNOT(wires=[0, 1]),
            qml.RX(x[1, 0], wires=1),
            qml.RY(x[1, 1], wires=0),
            qml.RZ(x[1, 2], wires=1)
        ]
        measurements = [qml.expval(qml.Z(0)), qml.probs(wires=1)]
        tape = qml.tape.QuantumTape(ops, measurements)

    We can use the ``vjp`` function to compute the vector-Jacobian product,
    given a gradient-output vector ``dy``:

    >>> dy = torch.tensor([1., 1., 1.], dtype=torch.float64)
    >>> vjp_tapes, fn = qml.gradients.vjp(tape, dy, qml.gradients.param_shift)

    Note that ``dy`` has shape ``(3,)``, matching the output dimension of the tape
    (1 expectation and 2 probability values).

    Executing the VJP tapes, and applying the processing function:

    >>> dev = qml.device("default.qubit")
    >>> vjp = fn(qml.execute(vjp_tapes, dev, diff_method=qml.gradients.param_shift, interface="torch"))
    >>> vjp
    tensor([-1.1562e-01, -1.3862e-02, -9.0841e-03, -1.5214e-16, -4.8217e-01,
             2.1329e-17], dtype=torch.float64, grad_fn=<SumBackward1>)

    The output VJP is also differentiable with respect to the tape parameters:

    >>> cost = torch.sum(vjp)
    >>> cost.backward()
    >>> x.grad
    tensor([[-1.1025e+00, -2.0554e-01, -1.4917e-01],
            [-1.2490e-16, -9.1580e-01,  0.0000e+00]], dtype=torch.float64)
    """
    gradient_kwargs = gradient_kwargs or {}
    num_params = len(tape.trainable_params)

    if num_params == 0:
        # The tape has no trainable parameters; the VJP
        # is simply none.
        return [], lambda _, num=None: None

    try:
        if _all_close_to_zero(dy):
            # If the dy vector is zero, then the
            # corresponding element of the VJP will be zero,
            # and we can avoid a quantum computation.

            def func(_, num=None):  # pylint: disable=unused-argument
                res = qml.math.convert_like(np.zeros([num_params]), dy)
                multi = len(tape.measurements) > 1
                if multi:
                    multi_dy = dy[0]
                    res = qml.math.convert_like(res, multi_dy)
                    return qml.math.cast_like(res, multi_dy)
                return qml.math.cast_like(res, dy)

            return [], func
    except (AttributeError, TypeError, NotImplementedError):
        pass

    gradient_tapes, fn = gradient_fn(tape, **gradient_kwargs)

    def processing_fn(results, num=None):
        # postprocess results to compute the Jacobian
        jac = fn(results)

        multi = len(tape.measurements) > 1
        comp_vjp_fn = compute_vjp_multi if multi else compute_vjp_single

        if not tape.shots.has_partitioned_shots:
            return comp_vjp_fn(dy, jac, num=num)

        vjp_ = [comp_vjp_fn(dy_, jac_, num=num) for dy_, jac_ in zip(dy, jac)]
        return qml.math.sum(qml.math.stack(vjp_), 0)

    return gradient_tapes, processing_fn


# pylint: disable=too-many-arguments
[docs]def batch_vjp(tapes, dys, gradient_fn, reduction="append", gradient_kwargs=None):
    r"""Generate the gradient tapes and processing function required to compute
    the vector-Jacobian products of a batch of tapes.

    Consider a function :math:`\mathbf{f}(\mathbf{x})`. The Jacobian is given by

    .. math::

        \mathbf{J}_{\mathbf{f}}(\mathbf{x}) = \begin{pmatrix}
            \frac{\partial f_1}{\partial x_1} &\cdots &\frac{\partial f_1}{\partial x_n}\\
            \vdots &\ddots &\vdots\\
            \frac{\partial f_m}{\partial x_1} &\cdots &\frac{\partial f_m}{\partial x_n}\\
        \end{pmatrix}.

    During backpropagation, the chain rule is applied. For example, consider the
    cost function :math:`h = y\circ f: \mathbb{R}^n \rightarrow \mathbb{R}`,
    where :math:`y: \mathbb{R}^m \rightarrow \mathbb{R}`.
    The gradient is:

    .. math::

        \nabla h(\mathbf{x}) = \frac{\partial y}{\partial \mathbf{f}} \frac{\partial \mathbf{f}}{\partial \mathbf{x}}
        = \frac{\partial y}{\partial \mathbf{f}} \mathbf{J}_{\mathbf{f}}(\mathbf{x}).

    Denote :math:`d\mathbf{y} = \frac{\partial y}{\partial \mathbf{f}}`; we can write this in the form
    of a matrix multiplication:

    .. math:: \left[\nabla h(\mathbf{x})\right]_{j} = \sum_{i=0}^m d\mathbf{y}_i ~ \mathbf{J}_{ij}.

    Thus, we can see that the gradient of the cost function is given by the so-called
    **vector-Jacobian product**; the product of the row-vector :math:`d\mathbf{y}`, representing
    the gradient of subsequent components of the cost function, and :math:`\mathbf{J}`,
    the Jacobian of the current node of interest.

    Args:
        tapes (Sequence[.QuantumTape]): sequence of quantum tapes to differentiate
        dys (Sequence[tensor_like]): Sequence of gradient-output vectors ``dy``. Must be the
            same length as ``tapes``. Each ``dy`` tensor should have shape
            matching the output shape of the corresponding tape.
        gradient_fn (callable): the gradient transform to use to differentiate
            the tapes
        reduction (str): Determines how the vector-Jacobian products are returned.
            If ``append``, then the output of the function will be of the form
            ``List[tensor_like]``, with each element corresponding to the VJP of each
            input tape. If ``extend``, then the output VJPs will be concatenated.
        gradient_kwargs (dict): dictionary of keyword arguments to pass when
            determining the gradients of tapes

    Returns:
        List[tensor_like or None]: list of vector-Jacobian products. ``None`` elements corresponds
        to tapes with no trainable parameters.

    **Example**

    Consider the following Torch-compatible quantum tapes:

    .. code-block:: python

        import torch

        x = torch.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], requires_grad=True, dtype=torch.float64)

        ops = [
            qml.RX(x[0, 0], wires=0),
            qml.RY(x[0, 1], wires=1),
            qml.RZ(x[0, 2], wires=0),
            qml.CNOT(wires=[0, 1]),
            qml.RX(x[1, 0], wires=1),
            qml.RY(x[1, 1], wires=0),
            qml.RZ(x[1, 2], wires=1)
        ]
        measurements1 = [qml.expval(qml.Z(0)), qml.probs(wires=1)]
        tape1 = qml.tape.QuantumTape(ops, measurements1)

        measurements2 = [qml.expval(qml.Z(0) @ qml.Z(1))]
        tape2 = qml.tape.QuantumTape(ops, measurements2)

        tapes = [tape1, tape2]

    Both tapes share the same circuit ansatz, but have different measurement outputs.

    We can use the ``batch_vjp`` function to compute the vector-Jacobian product,
    given a list of gradient-output vectors ``dys`` per tape:

    >>> dys = [torch.tensor([1., 1., 1.], dtype=torch.float64),
    ...        torch.tensor([1.], dtype=torch.float64)]
    >>> vjp_tapes, fn = qml.gradients.batch_vjp(tapes, dys, qml.gradients.param_shift)

    Note that each ``dy`` has shape matching the output dimension of the tape
    (``tape1`` has 1 expectation and 2 probability values --- 3 outputs --- and ``tape2``
    has 1 expectation value).

    Executing the VJP tapes, and applying the processing function:

    >>> dev = qml.device("default.qubit")
    >>> vjps = fn(qml.execute(vjp_tapes, dev, diff_method=qml.gradients.param_shift, interface="torch"))
    >>> vjps
    [tensor([-1.1562e-01, -1.3862e-02, -9.0841e-03, -1.5214e-16, -4.8217e-01,
          2.1329e-17], dtype=torch.float64, grad_fn=<SumBackward1>),
     tensor([ 1.7393e-01, -1.6412e-01, -5.3983e-03, -2.9366e-01, -4.0083e-01,
              2.1134e-17], dtype=torch.float64, grad_fn=<SqueezeBackward3>)]

    We have two VJPs; one per tape. Each one corresponds to the number of parameters
    on the tapes (6).

    The output VJPs are also differentiable with respect to the tape parameters:

    >>> cost = torch.sum(vjps[0] + vjps[1])
    >>> cost.backward()
    >>> x.grad
    tensor([[-0.4792, -0.9086, -0.2420],
            [-0.0930, -1.0772,  0.0000]], dtype=torch.float64)
    """
    gradient_kwargs = gradient_kwargs or {}
    reshape_info = []
    gradient_tapes = []
    processing_fns = []

    # Loop through the tapes and dys vector
    for tape, dy in zip(tapes, dys):
        g_tapes, fn = vjp(tape, dy, gradient_fn, gradient_kwargs=gradient_kwargs)
        reshape_info.append(len(g_tapes))
        processing_fns.append(fn)
        gradient_tapes.extend(g_tapes)

    def processing_fn(results, nums=None):
        vjps = []
        start = 0

        if nums is None:
            nums = [None] * len(tapes)

        for t_idx in range(len(tapes)):
            # extract the correct results from the flat list
            res_len = reshape_info[t_idx]
            res_t = results[start : start + res_len]
            start += res_len
            # postprocess results to compute the VJP
            vjp_ = processing_fns[t_idx](res_t, num=nums[t_idx])
            if vjp_ is None:
                if reduction == "append":
                    vjps.append(None)
                continue

            if isinstance(reduction, str):
                getattr(vjps, reduction)(vjp_)
            elif callable(reduction):
                reduction(vjps, vjp_)

        return vjps

    return gradient_tapes, processing_fn
Source code for pennylane.gradients.vjp

Contents

Downloads

Source code for pennylane.gradients.vjp

Contents

Downloads

Related