Source code for pennylane.optimize.shot_adaptive
# Copyright 2018-2021 Xanadu Quantum Technologies Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shot adaptive optimizer"""
# pylint: disable=too-many-instance-attributes,too-many-arguments,too-many-branches
from copy import copy
import numpy as np
from scipy.stats import multinomial
import pennylane as qml
from .gradient_descent import GradientDescentOptimizer
[docs]class ShotAdaptiveOptimizer(GradientDescentOptimizer):
r"""Optimizer where the shot rate is adaptively calculated using the variances of the
parameter-shift gradient.
By keeping a running average of the parameter-shift gradient and the *variance*
of the parameter-shift gradient, this optimizer frugally distributes a shot
budget across the partial derivatives of each parameter.
In addition, weighted random sampling can be used to further distribute the
shot budget across the local terms from which the Hamiltonian is constructed.
.. note::
The shot adaptive optimizer only supports single QNode objects as objective functions.
The bound device must also be instantiated with a finite number of shots.
Args:
min_shots (int): The minimum number of shots used to estimate the expectations
of each term in the Hamiltonian. Note that this must be larger than 2 for the variance
of the gradients to be computed.
mu (float): The running average constant :math:`\mu \in [0, 1]`. Used to control how
quickly the number of shots recommended for each gradient component changes.
b (float): Regularization bias. The bias should be kept small, but non-zero.
term_sampling (str): The random sampling algorithm to multinomially distribute the shot
budget across terms in the Hamiltonian expectation value. Currently, only
``"weighted_random_sampling"`` is supported. The default value is ``None``, which
disables the random sampling behaviour.
stepsize (float): The learning rate :math:`\eta`. The learning rate *must* be such
that :math:`\eta < 2/L = 2/\sum_i|c_i|`, where:
* :math:`L \leq \sum_i|c_i|` is the bound on the `Lipschitz constant
<https://en.wikipedia.org/wiki/Lipschitz_continuity>`__ of the variational quantum
algorithm objective function, and
* :math:`c_i` are the coefficients of the Hamiltonian used in the objective function.
**Example**
For VQE/VQE-like problems, the objective function for the optimizer can be realized
as a :class:`~.QNode` object measuring the expectation of a :class:`~.ops.LinearCombination`.
>>> from pennylane import numpy as np
>>> coeffs = [2, 4, -1, 5, 2]
>>> obs = [
... qml.X(1),
... qml.Z(1),
... qml.X(0) @ qml.X(1),
... qml.Y(0) @ qml.Y(1),
... qml.Z(0) @ qml.Z(1)
... ]
>>> H = qml.Hamiltonian(coeffs, obs)
>>> dev = qml.device("default.qubit", wires=2, shots=100)
>>> @qml.qnode(dev)
>>> def cost(weights):
... qml.StronglyEntanglingLayers(weights, wires=range(2))
... return qml.expval(H)
Once constructed, the cost function can be passed directly to the
optimizer's ``step`` method. The attributes ``opt.shots_used`` and
``opt.total_shots_used`` can be used to track the number of shots per
iteration, and across the life of the optimizer, respectively.
>>> shape = qml.templates.StronglyEntanglingLayers.shape(n_layers=2, n_wires=2)
>>> params = np.random.random(shape)
>>> opt = qml.ShotAdaptiveOptimizer(min_shots=10, term_sampling="weighted_random_sampling")
>>> for i in range(60):
... params = opt.step(cost, params)
... print(f"Step {i}: cost = {cost(params):.2f}, shots_used = {opt.total_shots_used}")
Step 0: cost = -5.69, shots_used = 240
Step 1: cost = -2.98, shots_used = 336
Step 2: cost = -4.97, shots_used = 624
Step 3: cost = -5.53, shots_used = 1054
Step 4: cost = -6.50, shots_used = 1798
Step 5: cost = -6.68, shots_used = 2942
Step 6: cost = -6.99, shots_used = 4350
Step 7: cost = -6.97, shots_used = 5814
Step 8: cost = -7.00, shots_used = 7230
Step 9: cost = -6.69, shots_used = 9006
Step 10: cost = -6.85, shots_used = 11286
Step 11: cost = -6.63, shots_used = 14934
Step 12: cost = -6.86, shots_used = 17934
Step 13: cost = -7.19, shots_used = 22950
Step 14: cost = -6.99, shots_used = 28302
Step 15: cost = -7.38, shots_used = 34134
Step 16: cost = -7.66, shots_used = 41022
Step 17: cost = -7.21, shots_used = 48918
Step 18: cost = -7.53, shots_used = 56286
Step 19: cost = -7.46, shots_used = 63822
Step 20: cost = -7.31, shots_used = 72534
Step 21: cost = -7.23, shots_used = 82014
Step 22: cost = -7.31, shots_used = 92838
.. details::
:title: Usage Details
The shot adaptive optimizer is based on the iCANS1 optimizer by
`Kübler et al. (2020) <https://quantum-journal.org/papers/q-2020-05-11-263/>`__, and works
as follows:
1. The initial step of the optimizer is performed with some specified minimum
number of shots, :math:`s_{min}`, for all partial derivatives.
2. The parameter-shift rule is then used to estimate the gradient :math:`g_i` with :math:`s_i` shots
for each parameter :math:`\theta_i`, parameters, as well as the variances
:math:`v_i` of the estimated gradients.
3. Gradient descent is performed for each parameter :math:`\theta_i`, using
the pre-defined learning rate :math:`\eta` and the gradient information :math:`g_i`:
:math:`\theta_i \rightarrow \theta_i - \eta g_i`.
4. A maximum shot number is set by maximizing the improvement in the expected gain per shot.
For a specific parameter value, the improvement in the expected gain per shot
is then calculated via
.. math::
\gamma_i = \frac{1}{s_i} \left[ \left(\eta - \frac{1}{2} L\eta^2\right)
g_i^2 - \frac{L\eta^2}{2s_i}v_i \right],
where:
* :math:`L \leq \sum_i|c_i|` is the bound on the `Lipschitz constant
<https://en.wikipedia.org/wiki/Lipschitz_continuity>`__ of the variational quantum algorithm objective function,
* :math:`c_i` are the coefficients of the Hamiltonian, and
* :math:`\eta` is the learning rate, and *must* be bound such that :math:`\eta < 2/L`
for the above expression to hold.
5. Finally, the new values of :math:`s_{i+1}` (shots for partial derivative of parameter
:math:`\theta_i`) is given by:
.. math::
s_{i+1} = \frac{2L\eta}{2-L\eta}\left(\frac{v_i}{g_i^2}\right)\propto
\frac{v_i}{g_i^2}.
In addition to the above, to counteract the presence of noise in the system, a
running average of :math:`g_i` and :math:`s_i` (:math:`\chi_i` and :math:`\xi_i` respectively)
are used when computing :math:`\gamma_i` and :math:`s_i`.
For more details, see:
* Andrew Arrasmith, Lukasz Cincio, Rolando D. Somma, and Patrick J. Coles. "Operator Sampling
for Shot-frugal Optimization in Variational Algorithms." `arXiv:2004.06252
<https://arxiv.org/abs/2004.06252>`__ (2020).
* Jonas M. Kübler, Andrew Arrasmith, Lukasz Cincio, and Patrick J. Coles. "An Adaptive Optimizer
for Measurement-Frugal Variational Algorithms." `Quantum 4, 263
<https://quantum-journal.org/papers/q-2020-05-11-263/>`__ (2020).
"""
def __init__(self, min_shots, term_sampling=None, mu=0.99, b=1e-6, stepsize=0.07):
self.term_sampling = term_sampling
self.trainable_args = set()
# hyperparameters
self.min_shots = min_shots
self.mu = mu # running average constant
self.b = b # regularization bias
self.lipschitz = None
self.shots_used = 0
"""int: number of shots used on the current iteration"""
self.total_shots_used = 0
"""int: total number of shots used across all iterations"""
# total number of iterations
self.k = 0
# Number of shots per parameter
self.s = None
# maximum number of shots required to evaluate across all parameters
self.max_shots = None
# Running average of the parameter gradients
self.chi = None
# Running average of the variance of the parameter gradients
self.xi = None
super().__init__(stepsize=stepsize)
[docs] @staticmethod
def qnode_weighted_random_sampling(qnode, coeffs, observables, shots, argnums, *args, **kwargs):
"""Returns an array of length ``shots`` containing single-shot estimates
of the Hamiltonian gradient. The shots are distributed randomly over
the terms in the Hamiltonian, as per a multinomial distribution.
Args:
qnode (.QNode): A QNode that returns the expectation value of a Hamiltonian.
coeffs (List[float]): The coefficients of the Hamiltonian being measured
observables (List[Observable]): The terms of the Hamiltonian being measured
shots (int): The number of shots used to estimate the Hamiltonian expectation
value. These shots are distributed over the terms in the Hamiltonian,
as per a Multinomial distribution.
argnums (Sequence[int]): the QNode argument indices which are trainable
*args: Arguments to the QNode
**kwargs: Keyword arguments to the QNode
Returns:
array[float]: the single-shot gradients of the Hamiltonian expectation value
"""
qnode = copy(qnode)
base_func = qnode.func
# determine the shot probability per term
prob_shots = np.abs(coeffs) / np.sum(np.abs(coeffs))
# construct the multinomial distribution, and sample
# from it to determine how many shots to apply per term
si = multinomial(n=shots, p=prob_shots)
shots_per_term = si.rvs()[0]
grads = []
for o, c, p, s in zip(observables, coeffs, prob_shots, shots_per_term):
# if the number of shots is 0, do nothing
if s == 0:
continue
def func(*qnode_args, **qnode_kwargs):
qs = qml.tape.make_qscript(base_func)(*qnode_args, **qnode_kwargs)
for op in qs.operations:
qml.apply(op)
return qml.expval(o) # pylint:disable=cell-var-from-loop
qnode.func = func
new_shots = 1 if s == 1 else [(1, int(s))]
if s > 1:
def cost(*args, **kwargs):
# pylint: disable=cell-var-from-loop
return qml.math.stack(qnode(*args, **kwargs))
else:
cost = qnode
jacs = qml.jacobian(cost, argnum=argnums)(*args, **kwargs, shots=new_shots)
if s == 1:
jacs = [np.expand_dims(j, 0) for j in jacs]
# Divide each term by the probability per shot. This is
# because we are sampling one at a time.
grads.append([(c * j / p) for j in jacs])
return [np.concatenate(i) for i in zip(*grads)]
[docs] @staticmethod
def check_device(dev):
r"""Verifies that the device used by the objective function is non-analytic.
Args:
dev (.devices.Device): the device to verify
Raises:
ValueError: if the device is analytic
"""
if not dev.shots:
raise ValueError(
"The Rosalin optimizer can only be used with devices "
"that estimate expectation values with a finite number of shots."
)
[docs] def check_learning_rate(self, coeffs):
r"""Verifies that the learning rate is less than 2 over the Lipschitz constant,
where the Lipschitz constant is given by :math:`\sum |c_i|` for Hamiltonian
coefficients :math:`c_i`.
Args:
coeffs (Sequence[float]): the coefficients of the terms in the Hamiltonian
Raises:
ValueError: if the learning rate is large than :math:`2/\sum |c_i|`
"""
self.lipschitz = np.sum(np.abs(coeffs))
if self.stepsize > 2 / self.lipschitz:
raise ValueError(f"The learning rate must be less than {2 / self.lipschitz}")
def _single_shot_qnode_gradients(self, qnode, args, kwargs):
"""Compute the single shot gradients of a QNode."""
self.check_device(qnode.device)
tape = qml.workflow.construct_tape(qnode)(*args, **kwargs)
[expval] = tape.measurements
coeffs, observables = (
expval.obs.terms()
if isinstance(expval.obs, qml.ops.LinearCombination)
else ([1.0], [expval.obs])
)
if self.lipschitz is None:
self.check_learning_rate(coeffs)
if self.term_sampling == "weighted_random_sampling":
return self.qnode_weighted_random_sampling(
qnode, coeffs, observables, self.max_shots, self.trainable_args, *args, **kwargs
)
if self.term_sampling is not None:
raise ValueError(
f"Unknown Hamiltonian term sampling method {self.term_sampling}. "
"Only term_sampling='weighted_random_sampling' and "
"term_sampling=None currently supported."
)
new_shots = [(1, int(self.max_shots))]
def cost(*args, **kwargs):
return qml.math.stack(qnode(*args, **kwargs, shots=new_shots))
grads = [qml.jacobian(cost, argnum=i)(*args, **kwargs) for i in self.trainable_args]
return grads
[docs] def compute_grad(
self, objective_fn, args, kwargs
): # pylint: disable=signature-differs,arguments-differ,arguments-renamed
r"""Compute the gradient of the objective function, as well as the variance of the gradient,
at the given point.
Args:
objective_fn (function): the objective function for optimization
args: arguments to the objective function
kwargs: keyword arguments to the objective function
Returns:
tuple[array[float], array[float]]: a tuple of NumPy arrays containing the gradient
:math:`\nabla f(x^{(t)})` and the variance of the gradient
"""
if isinstance(objective_fn, qml.QNode) or hasattr(objective_fn, "device"):
grads = self._single_shot_qnode_gradients(objective_fn, args, kwargs)
else:
raise ValueError(
"The objective function must be encoded as a single QNode object for the shot "
"adaptive optimizer. "
)
# grads will have dimension [max(self.s), *params.shape]
# For each parameter, we want to truncate the number of shots to self.s[idx],
# and take the mean and the variance.
gradients = []
gradient_variances = []
for i, grad in enumerate(grads):
p_ind = np.ndindex(*grad.shape[1:])
g = np.zeros_like(grad[0])
s = np.zeros_like(grad[0])
for idx in p_ind:
grad_slice = grad[(slice(0, self.s[i][idx]),) + idx]
g[idx] = np.mean(grad_slice)
s[idx] = np.var(grad_slice, ddof=1)
gradients.append(g)
gradient_variances.append(s)
return gradients, gradient_variances
[docs] def step(self, objective_fn, *args, **kwargs):
"""Update trainable arguments with one step of the optimizer.
Args:
objective_fn (function): the objective function for optimization
*args: variable length argument list for objective function
**kwargs: variable length of keyword arguments for the objective function
Returns:
list[array]: The new variable values :math:`x^{(t+1)}`.
If single arg is provided, list[array] is replaced by array.
"""
self.trainable_args = set()
for index, arg in enumerate(args):
if getattr(arg, "requires_grad", False):
self.trainable_args |= {index}
if self.s is None:
# Number of shots per parameter
self.s = [
np.zeros_like(a, dtype=np.int64) + self.min_shots
for i, a in enumerate(args)
if i in self.trainable_args
]
# keep track of the number of shots run
s = np.concatenate([i.flatten() for i in self.s])
self.max_shots = max(s)
self.shots_used = int(2 * np.sum(s))
self.total_shots_used += self.shots_used
# compute the gradient, as well as the variance in the gradient,
# using the number of shots determined by the array s.
grads, grad_variances = self.compute_grad(objective_fn, args, kwargs)
new_args = self.apply_grad(grads, args)
if self.xi is None:
self.chi = [np.zeros_like(g, dtype=np.float64) for g in grads]
self.xi = [np.zeros_like(g, dtype=np.float64) for g in grads]
# running average of the gradient
self.chi = [self.mu * c + (1 - self.mu) * g for c, g in zip(self.chi, grads)]
# running average of the gradient variance
self.xi = [self.mu * x + (1 - self.mu) * v for x, v in zip(self.xi, grad_variances)]
for idx, (c, x) in enumerate(zip(self.chi, self.xi)):
xi = x / (1 - self.mu ** (self.k + 1))
chi = c / (1 - self.mu ** (self.k + 1))
# determine the new optimum shots distribution for the next
# iteration of the optimizer
s = np.ceil(
(2 * self.lipschitz * self.stepsize * xi)
/ ((2 - self.lipschitz * self.stepsize) * (chi**2 + self.b * (self.mu**self.k)))
)
# apply an upper and lower bound on the new shot distributions,
# to avoid the number of shots reducing below min(2, min_shots),
# or growing too significantly.
gamma = (
(self.stepsize - self.lipschitz * self.stepsize**2 / 2) * chi**2
- xi * self.lipschitz * self.stepsize**2 / (2 * s)
) / s
argmax_gamma = np.unravel_index(np.argmax(gamma), gamma.shape)
smax = max(s[argmax_gamma], 2)
self.s[idx] = np.int64(np.clip(s, max(2, self.min_shots), smax))
self.k += 1
# unwrap from list if one argument, cleaner return
if len(new_args) == 1:
return new_args[0]
return new_args
[docs] def step_and_cost(self, objective_fn, *args, **kwargs):
"""Update trainable arguments with one step of the optimizer and return the corresponding
objective function value prior to the step.
The objective function will be evaluated using the maximum number of shots
across all parameters as determined by the optimizer during the
optimization step.
.. warning::
Unlike other gradient descent optimizers, the objective function will be evaluated
**separately** to the gradient computation, and will result in extra
device evaluations.
Args:
objective_fn (function): the objective function for optimization
*args : variable length argument list for objective function
**kwargs : variable length of keyword arguments for the objective function
Returns:
tuple[list [array], float]: the new variable values :math:`x^{(t+1)}` and the objective
function output prior to the step.
If single arg is provided, list [array] is replaced by array.
"""
new_args = self.step(objective_fn, *args, **kwargs)
forward = objective_fn(*args, **kwargs, shots=int(self.max_shots))
return new_args, forward
_modules/pennylane/optimize/shot_adaptive
Download Python script
Download Notebook
View on GitHub