Safemotion Lib
train_loop.py
# encoding: utf-8
"""
credit:
https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/train_loop.py
"""

import logging
import time
import weakref

import numpy as np
import torch
from torch.cuda import amp
from torch.nn.parallel import DistributedDataParallel

import fastreid.utils.comm as comm
from fastreid.utils.events import EventStorage

__all__ = ["HookBase", "TrainerBase", "SimpleTrainer"]


class HookBase:
    """
    Base class for hooks that can be registered with :class:`TrainerBase`.

    Each hook can implement 4 methods. The way they are called is demonstrated
    in the following snippet:

    .. code-block:: python

        hook.before_train()
        for iter in range(start_iter, max_iter):
            hook.before_step()
            trainer.run_step()
            hook.after_step()
        hook.after_train()

    Notes:
        1. In the hook method, users can access `self.trainer` to access more
           properties about the context (e.g., current iteration).
        2. A hook that does something in :meth:`before_step` can often be
           implemented equivalently in :meth:`after_step`.
           If the hook takes non-trivial time, it is strongly recommended to
           implement the hook in :meth:`after_step` instead of :meth:`before_step`.
           The convention is that :meth:`before_step` should only take negligible time.
           Following this convention will allow hooks that do care about the difference
           between :meth:`before_step` and :meth:`after_step` (e.g., timer) to
           function properly.

    Attributes:
        trainer: A weak reference to the trainer object. Set by the trainer when the hook is
            registered.
    """

    def before_train(self):
        """
        Called before the first iteration.
        """
        pass

    def after_train(self):
        """
        Called after the last iteration.
        """
        pass

    def before_step(self):
        """
        Called before each iteration.
        """
        pass

    def after_step(self):
        """
        Called after each iteration.
        """
        pass
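
# Illustrative only (not part of the original file): a minimal hook that
# follows the contract above. `IterationTimer` is a hypothetical name.
#
#   class IterationTimer(HookBase):
#       def before_step(self):
#           self._step_start = time.perf_counter()
#
#       def after_step(self):
#           # `self.trainer` is the weak reference set by register_hooks().
#           elapsed = time.perf_counter() - self._step_start
#           self.trainer.storage.put_scalar("iter_time", elapsed)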


class TrainerBase:
    """
    Base class for iterative trainer with hooks.

    The only assumption we make here is: the training runs in a loop.
    A subclass can implement what the loop is.
    We make no assumptions about the existence of dataloader, optimizer, model, etc.

    Attributes:
        iter (int): the current iteration.
        start_iter (int): The iteration to start with.
            By convention the minimum possible value is 0.
        max_iter (int): The iteration to end training.
        storage (EventStorage): An EventStorage that's opened during the course of training.
    """

    def __init__(self):
        self._hooks = []

    def register_hooks(self, hooks):
        """
        Register hooks to the trainer. The hooks are executed in the order
        they are registered.

        Args:
            hooks (list[Optional[HookBase]]): list of hooks
        """
        hooks = [h for h in hooks if h is not None]
        for h in hooks:
            assert isinstance(h, HookBase)
            # To avoid circular reference, hooks and trainer cannot own each other.
            # This normally does not matter, but will cause a memory leak if the
            # involved objects contain __del__:
            # See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/
            h.trainer = weakref.proxy(self)
        self._hooks.extend(hooks)
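
    # Illustrative usage (not part of the original file); `trainer` and
    # `IterationTimer` are hypothetical:
    #
    #   trainer.register_hooks([IterationTimer(), None])  # None entries are dropped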

    def train(self, start_iter: int, max_iter: int):
        """
        Args:
            start_iter, max_iter (int): See docs above
        """
        logger = logging.getLogger(__name__)
        logger.info("Starting training from iteration {}".format(start_iter))

        self.iter = self.start_iter = start_iter
        self.max_iter = max_iter

        with EventStorage(start_iter) as self.storage:
            try:
                self.before_train()
                for self.iter in range(start_iter, max_iter):
                    self.before_step()
                    self.run_step()
                    self.after_step()
            except Exception:
                logger.exception("Exception during training:")
                # re-raise so a failure is not silently swallowed;
                # after_train hooks still run via the finally block
                raise
            finally:
                self.after_train()
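
    # Illustrative usage (not part of the original file); assumes a concrete
    # subclass that implements run_step(), e.g. SimpleTrainer below:
    #
    #   trainer.register_hooks([...])
    #   trainer.train(start_iter=0, max_iter=120000)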

    def before_train(self):
        for h in self._hooks:
            h.before_train()

    def after_train(self):
        for h in self._hooks:
            h.after_train()

    def before_step(self):
        for h in self._hooks:
            h.before_step()

    def after_step(self):
        for h in self._hooks:
            h.after_step()
        # this guarantees that in each hook's after_step, storage.iter == trainer.iter
        self.storage.step()

    def run_step(self):
        raise NotImplementedError


class SimpleTrainer(TrainerBase):
    """
    A simple trainer for the most common type of task:
    single-cost single-optimizer single-data-source iterative optimization.

    It assumes that every step, you:

    1. Compute the loss with a batch of data from the data_loader.
    2. Compute the gradients with the above loss.
    3. Update the model with the optimizer.

    If you want to do anything fancier than this,
    either subclass TrainerBase and implement your own `run_step`,
    or write your own training loop.
    """

    def __init__(self, model, data_loader, optimizer, amp_enabled):
        """
        Args:
            model: a torch Module. Takes a batch of data from data_loader and
                returns a dict of heads.
            data_loader: an iterable. Contains data to be used to call model.
            optimizer: a torch optimizer.
            amp_enabled (bool): whether to train with automatic mixed precision
                via `torch.cuda.amp`.
        """
        super().__init__()

        """
        We set the model to training mode in the trainer.
        However, it's valid to train a model that's in eval mode.
        If you want your model (or a submodule of it) to behave
        like evaluation during training, you can override its train() method.
        """
        model.train()

        self.model = model
        self.data_loader = data_loader
        self._data_loader_iter = iter(data_loader)
        self.optimizer = optimizer
        self.amp_enabled = amp_enabled

        if amp_enabled:
            # Creates a GradScaler once at the beginning of training.
            self.scaler = amp.GradScaler()
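
    # Illustrative construction (not part of the original file); `build_model`,
    # `build_reid_train_loader` and `build_optimizer` are assumed fastreid
    # builders, and `cfg` a config node:
    #
    #   trainer = SimpleTrainer(
    #       model=build_model(cfg),
    #       data_loader=build_reid_train_loader(cfg),
    #       optimizer=build_optimizer(cfg, model),
    #       amp_enabled=True,
    #   )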

    def run_step(self):
        """
        Implement the standard training logic described above.
        """
        assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
        start = time.perf_counter()
        """
        If you want to do something with the data, you can wrap the dataloader.
        """
        data = next(self._data_loader_iter)
        data_time = time.perf_counter() - start

        """
        If you want to do something with the heads, you can wrap the model.
        """
        with amp.autocast(enabled=self.amp_enabled):
            outs = self.model(data)

            # Compute loss
            if isinstance(self.model, DistributedDataParallel):
                loss_dict = self.model.module.losses(outs)
            else:
                loss_dict = self.model.losses(outs)

            losses = sum(loss_dict.values())
        # Log metrics on a separate CUDA stream, presumably so the CPU-GPU
        # syncs in _write_metrics do not stall work on the default stream.
        with torch.cuda.stream(torch.cuda.Stream()):
            metrics_dict = loss_dict
            metrics_dict["data_time"] = data_time
            self._write_metrics(metrics_dict)
            self._detect_anomaly(losses, loss_dict)

        """
        If you need to accumulate gradients or something similar, you can
        wrap the optimizer with your custom `zero_grad()` method.
        """
        self.optimizer.zero_grad()

        if self.amp_enabled:
            # Scale the loss before backward; the scaler then unscales the
            # gradients, skips the step on inf/NaN, and updates its scale.
            self.scaler.scale(losses).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()
        else:
            losses.backward()
            """
            If you need gradient clipping/scaling or other processing, you can
            wrap the optimizer with your custom `step()` method.
            """
            self.optimizer.step()
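
            # A sketch (not in the original file) of the clipping the docstring
            # above alludes to, using torch.nn.utils.clip_grad_norm_ between
            # backward() and step(); max_norm=5.0 is an arbitrary example value:
            #
            #   losses.backward()
            #   torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=5.0)
            #   self.optimizer.step()
            #
            # With AMP enabled, call self.scaler.unscale_(self.optimizer) before
            # clipping so norms are computed on unscaled gradients.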

    def _detect_anomaly(self, losses, loss_dict):
        if not torch.isfinite(losses).all():
            raise FloatingPointError(
                "Loss became infinite or NaN at iteration={}!\nloss_dict = {}".format(
                    self.iter, loss_dict
                )
            )
    def _write_metrics(self, metrics_dict: dict):
        """
        Args:
            metrics_dict (dict): dict of scalar metrics
        """
        metrics_dict = {
            k: v.detach().cpu().item() if isinstance(v, torch.Tensor) else float(v)
            for k, v in metrics_dict.items()
        }
        # gather metrics among all workers for logging
        # This assumes we do DDP-style training, which is currently the only
        # supported method in fastreid.
        all_metrics_dict = comm.gather(metrics_dict)

        if comm.is_main_process():
            if "data_time" in all_metrics_dict[0]:
                # data_time among workers can have high variance. The actual latency
                # caused by data_time is the maximum among workers.
                data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
                self.storage.put_scalar("data_time", data_time)

            # average the remaining metrics
            metrics_dict = {
                k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys()
            }
            total_losses_reduced = sum(loss for loss in metrics_dict.values())

            self.storage.put_scalar("total_loss", total_losses_reduced)
            if len(metrics_dict) > 1:
                self.storage.put_scalars(**metrics_dict)
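
    # Worked example (not from the original file): with 2 workers reporting
    # {"loss_cls": 1.0, "data_time": 0.1} and {"loss_cls": 3.0, "data_time": 0.4},
    # comm.gather collects both dicts on the main process, which logs
    # data_time = 0.4 (the max across workers) and loss_cls = 2.0 (the mean),
    # so total_loss = 2.0.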