# Copyright 2018-2021 Xanadu Quantum Technologies Inc.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Adam optimizer"""
from numpy import sqrt

from .gradient_descent import GradientDescentOptimizer


class AdamOptimizer(GradientDescentOptimizer):
    r"""Gradient-descent optimizer with adaptive learning rate, first and second moment.

    Adaptive Moment Estimation uses a step-dependent learning rate,
    a first moment :math:`a` and a second moment :math:`b`, reminiscent of
    the momentum and velocity of a particle:

    .. math::
        x^{(t+1)} = x^{(t)} - \eta^{(t+1)} \frac{a^{(t+1)}}{\sqrt{b^{(t+1)}} + \epsilon},

    where the update rules for the two moments are given by

    .. math::
        a^{(t+1)} &= \beta_1 a^{(t)} + (1-\beta_1) \nabla f(x^{(t)}),\\
        b^{(t+1)} &= \beta_2 b^{(t)} + (1-\beta_2) (\nabla f(x^{(t)}))^{\odot 2},\\
        \eta^{(t+1)} &= \eta \frac{\sqrt{1-\beta_2^{t+1}}}{1-\beta_1^{t+1}}.

    Above, :math:`(\nabla f(x^{(t)}))^{\odot 2}` denotes the element-wise square operation,
    which means that each element in the gradient is multiplied by itself. The
    hyperparameters :math:`\beta_1` and :math:`\beta_2` can also be step-dependent.
    Initially, the first and second moment are zero.

    The shift :math:`\epsilon` avoids division by zero.

    For more details, see `arXiv:1412.6980 <https://arxiv.org/abs/1412.6980>`_.

    Args:
        stepsize (float): the user-defined hyperparameter :math:`\eta`
        beta1 (float): hyperparameter governing the update of the first moment
        beta2 (float): hyperparameter governing the update of the second moment
        eps (float): offset :math:`\epsilon` added for numerical stability

    .. note::

        When using ``torch``, ``tensorflow`` or ``jax`` interfaces, refer to
        :doc:`Gradients and training </introduction/interfaces>` for suitable optimizers.
    """

    def __init__(self, stepsize=0.01, beta1=0.9, beta2=0.99, eps=1e-8):
        super().__init__(stepsize)
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.accumulation = None
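
    # A minimal usage sketch (assumptions: a differentiable scalar cost function
    # ``cost`` and trainable parameters created with ``pennylane.numpy`` imported
    # as ``pnp``; both names are illustrative, not part of this module):
    #
    #     opt = AdamOptimizer(stepsize=0.1)
    #     params = pnp.array([0.5, 0.1], requires_grad=True)
    #     for _ in range(100):
    #         params = opt.step(cost, params)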

    def apply_grad(self, grad, args):
        r"""Update the variables ``args`` to take a single optimization step. Flattens and
        unflattens the inputs to maintain nested iterables as the parameters of the optimization.

        Args:
            grad (tuple[ndarray]): the gradient of the objective function at
                point :math:`x^{(t)}`: :math:`\nabla f(x^{(t)})`
            args (tuple): the current value of the variables :math:`x^{(t)}`

        Returns:
            list: the new values :math:`x^{(t+1)}`
        """
        args_new = list(args)

        if self.accumulation is None:
            self.accumulation = {"fm": [0] * len(args), "sm": [0] * len(args), "t": 0}

        self.accumulation["t"] += 1

        # Update step size (instead of correcting for bias)
        new_stepsize = (
            self.stepsize
            * sqrt(1 - self.beta2 ** self.accumulation["t"])
            / (1 - self.beta1 ** self.accumulation["t"])
        )

        trained_index = 0
        for index, arg in enumerate(args):
            if getattr(arg, "requires_grad", False):
                self._update_accumulation(index, grad[trained_index])

                args_new[index] = arg - new_stepsize * self.accumulation["fm"][index] / (
                    sqrt(self.accumulation["sm"][index]) + self.eps
                )

                trained_index += 1

        return args_new
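
    # Sketch of calling ``apply_grad`` directly (hedged; ``pnp`` and the gradient
    # values are illustrative). The gradient tuple holds one entry per trainable
    # argument, in the same order as ``args``:
    #
    #     opt = AdamOptimizer(stepsize=0.1)
    #     params = pnp.array([0.5, 0.1], requires_grad=True)
    #     grad = (pnp.array([0.2, -0.4]),)
    #     (params,) = opt.apply_grad(grad, (params,))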

    def _update_accumulation(self, index, grad):
        r"""Update the moments.

        Args:
            index (int): the index of the argument to update
            grad (ndarray): the gradient for that trainable param
        """
        # update first moment
        self.accumulation["fm"][index] = (
            self.beta1 * self.accumulation["fm"][index] + (1 - self.beta1) * grad
        )

        # update second moment
        self.accumulation["sm"][index] = (
            self.beta2 * self.accumulation["sm"][index] + (1 - self.beta2) * grad**2
        )
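
    # Worked one-step example of the moment updates (illustrative numbers): with
    # the defaults beta1 = 0.9, beta2 = 0.99, zero-initialised accumulators and a
    # gradient entry of 2.0, a single call gives
    #
    #     fm[index] = 0.9 * 0 + 0.1 * 2.0         # = 0.2
    #     sm[index] = 0.99 * 0 + 0.01 * 2.0 ** 2  # = 0.04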

    def reset(self):
        """Reset optimizer by erasing memory of past steps."""
        self.accumulation = None

    @property
    def fm(self):
        """Returns estimated first moments of gradient"""
        return None if self.accumulation is None else self.accumulation["fm"]

    @property
    def sm(self):
        """Returns estimated second moments of gradient"""
        return None if self.accumulation is None else self.accumulation["sm"]

    @property
    def t(self):
        """Returns accumulated timesteps"""
        return None if self.accumulation is None else self.accumulation["t"]
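
    # Inspecting the accumulated state (hedged sketch; ``cost`` and ``params``
    # are illustrative and assumed to be defined as in the usage sketch above):
    #
    #     opt = AdamOptimizer()
    #     params = opt.step(cost, params)
    #     opt.fm       # list of first-moment estimates, one entry per argument
    #     opt.sm       # list of second-moment estimates
    #     opt.t        # number of accumulated steps, here 1
    #     opt.reset()  # afterwards fm, sm and t are all None again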