Safemotion Lib
adam.py
import math

import torch
from torch.optim.optimizer import Optimizer


class Adam(Optimizer):
    r"""Implements the Adam algorithm.

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
    The implementation of the L2 penalty follows changes proposed in
    `Decoupled Weight Decay Regularization`_. Parameter groups may also carry
    a boolean ``freeze`` entry; groups with ``freeze`` set to True are skipped
    by ``step``.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
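
    Example:
        Illustrative sketch; ``model``, ``data``, ``target`` and ``loss_fn``
        are placeholders defined by the caller.

        >>> optimizer = Adam(model.parameters(), lr=1e-3)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(data), target).backward()
        >>> optimizer.step()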

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
30 """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(Adam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adam, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
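
        Example:
            Illustrative sketch of passing a closure that recomputes the loss;
            ``model``, ``data``, ``target`` and ``loss_fn`` are placeholders
            defined by the caller.

            >>> def closure():
            ...     optimizer.zero_grad()
            ...     loss = loss_fn(model(data), target)
            ...     loss.backward()
            ...     return loss
            >>> optimizer.step(closure)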
59 """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            # Skip parameter groups marked as frozen. ``get`` is used because
            # 'freeze' is not part of the optimizer defaults and may be absent
            # from a group.
            if group.get('freeze', False):
                continue

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                step_size = group['lr'] / bias_correction1

                p.addcdiv_(exp_avg, denom, value=-step_size)

        return loss
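
A minimal usage sketch for the per-group freeze flag handled in step() above.
The two-layer model, the layer split into parameter groups, and the random
training data are illustrative placeholders, not part of this file; the sketch
assumes the Adam class defined above is in scope.

    import torch
    import torch.nn as nn

    # Illustrative model and data; any module and tensors work the same way.
    model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
    inputs = torch.randn(16, 4)
    targets = torch.randn(16, 1)
    loss_fn = nn.MSELoss()

    # Two parameter groups: the first is frozen and skipped by step(), the
    # second uses the AMSGrad variant. 'freeze' is carried along as an extra
    # per-group entry, so groups that omit it are updated normally.
    optimizer = Adam(
        [
            {'params': model[0].parameters(), 'freeze': True},
            {'params': model[2].parameters(), 'amsgrad': True},
        ],
        lr=1e-3,
    )

    for _ in range(3):
        optimizer.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        optimizer.step()

    # Un-freeze the first group to resume updating it.
    optimizer.param_groups[0]['freeze'] = False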