from collections import Counter

from torch.nn.parallel import DistributedDataParallel

from .train_loop import HookBase
30 "PeriodicCheckpointer",
Implement some common hooks.
    Create a hook using callback functions provided by the user.

    def __init__(self, *, before_train=None, after_train=None, before_step=None, after_step=None):
        Each argument is a function that takes one argument: the trainer.
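
        Example (an illustrative sketch, assuming a trainer that exposes
        ``register_hooks``; the lambda is a hypothetical callback):

        .. code-block:: python

            # Log the iteration number after every step.
            trainer.register_hooks([
                hooks.CallbackHook(after_step=lambda t: print("iter", t.iter))
            ])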
    Track the time spent for each iteration (each run_step call in the trainer).
    Print a summary at the end of training.

    This hook uses the time between the call to its :meth:`before_step`
    and :meth:`after_step` methods.
    Under the convention that :meth:`before_step` of all hooks should only
    take a negligible amount of time, the :class:`IterationTimer` hook should be
    placed at the beginning of the list of hooks to obtain accurate timing.

    def __init__(self, warmup_iter=3):
        Args:
            warmup_iter (int): the number of iterations at the beginning to exclude
                from timing.
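
        Example (a sketch of the placement convention described above; the
        other hooks are hypothetical placeholders):

        .. code-block:: python

            trainer.register_hooks([
                hooks.IterationTimer(warmup_iter=5),  # first, so hook overhead is excluded
                # ... all other hooks go after the timer ...
            ])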
    def after_train(self):
        logger = logging.getLogger(__name__)
        total_time = time.perf_counter() - self._start_time
        total_time_minus_hooks = self._total_timer.seconds()
        hook_time = total_time - total_time_minus_hooks

        num_iter = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter

        if num_iter > 0 and total_time_minus_hooks > 0:
            # Speed is meaningful only after warmup.
            logger.info(
                "Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
                    num_iter,
                    str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
                    total_time_minus_hooks / num_iter,
                )
            )

        logger.info(
            "Total training time: {} ({} on hooks)".format(
                str(datetime.timedelta(seconds=int(total_time))),
                str(datetime.timedelta(seconds=int(hook_time))),
            )
        )
    def after_step(self):
        # +1 because we're in after_step
        iter_done = self.trainer.iter - self.trainer.start_iter + 1
        if iter_done >= self._warmup_iter:
            sec = self._step_timer.seconds()
            self.trainer.storage.put_scalars(time=sec)
    Write events to EventStorage periodically.
    It is executed every ``period`` iterations and after the last iteration.

    def __init__(self, writers, period=20):
        Args:
            writers (list[EventWriter]): a list of EventWriter objects
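
        Example (a sketch using the EventWriter implementations shipped with
        this codebase; exact names may differ across versions):

        .. code-block:: python

            writers = [
                CommonMetricPrinter(max_iter),
                JSONWriter(os.path.join(output_dir, "metrics.json")),
                TensorboardXWriter(output_dir),
            ]
            trainer.register_hooks([hooks.PeriodicWriter(writers, period=20)])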
        for w in writers:
            assert isinstance(w, EventWriter), w
    def after_step(self):
        if (self.trainer.iter + 1) % self._period == 0 or (
            self.trainer.iter == self.trainer.max_iter - 1
        ):
            for writer in self._writers:
                writer.write()
    Same as :class:`fastreid.utils.checkpoint.PeriodicCheckpointer`, but as a hook.

    Note that when used as a hook, it is unable to save additional data
    other than what's defined by the given `checkpointer`.
    It is executed every ``period`` iterations and after the last iteration.
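
    Example (a sketch; ``Checkpointer`` refers to the checkpointer class used
    elsewhere in this codebase, and the period of 5000 is illustrative):

    .. code-block:: python

        checkpointer = Checkpointer(model, save_dir=output_dir, optimizer=optimizer)
        trainer.register_hooks([hooks.PeriodicCheckpointer(checkpointer, 5000)])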
    def after_step(self):
        self.step(self.trainer.iter)
    A hook which executes a torch builtin LR scheduler and summarizes the LR.
    It is executed after every iteration.

    def __init__(self, optimizer, scheduler):
        Args:
            optimizer (torch.optim.Optimizer):
            scheduler (torch.optim.lr_scheduler._LRScheduler):
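
        Example (a sketch with a plain torch scheduler; any ``_LRScheduler``
        subclass works):

        .. code-block:: python

            optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000)
            trainer.register_hooks([hooks.LRScheduler(optimizer, scheduler)])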
        self._optimizer = optimizer
        self._scheduler = scheduler

        # NOTE: some heuristics on which LR to summarize:
        # summarize the param group with the most parameters
        largest_group = max(len(g["params"]) for g in optimizer.param_groups)

        if largest_group == 1:
            # If all groups have one parameter,
            # then find the most common initial LR and use it for the summary
            lr_count = Counter([g["lr"] for g in optimizer.param_groups])
            lr = lr_count.most_common()[0][0]
            for i, g in enumerate(optimizer.param_groups):
                if g["lr"] == lr:
                    self._best_param_group_id = i
                    break
        else:
            for i, g in enumerate(optimizer.param_groups):
                if len(g["params"]) == largest_group:
                    self._best_param_group_id = i
                    break

    def after_step(self):
        lr = self._optimizer.param_groups[self._best_param_group_id]["lr"]
        self.trainer.storage.put_scalar("lr", lr, smoothing_hint=False)
        self._scheduler.step()
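    # An illustration of the summary heuristic above (not part of the hook):
    # when every param group holds a single parameter, the most common LR wins.
    #   Counter([0.1, 0.1, 0.01]).most_common()  ->  [(0.1, 2), (0.01, 1)]
    # so the "lr" scalar written to storage follows the first group using 0.1.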
    A hook which runs `torch.autograd.profiler.profile`.

    Examples:

    .. code-block:: python

        hooks.AutogradProfiler(
            lambda trainer: trainer.iter > 10 and trainer.iter < 20, self.cfg.OUTPUT_DIR
        )

    The above example will run the profiler for iterations 10~20 and dump
    results to ``OUTPUT_DIR``. We did not profile the first few iterations
    because they are typically slower than the rest.
    The result files can be loaded in the ``chrome://tracing`` page in the Chrome browser.

    Note:
        When used together with NCCL on older GPUs,
        the autograd profiler may cause a deadlock because it unnecessarily allocates
        memory on every device it sees. These memory management calls, if
        interleaved with NCCL calls, lead to deadlock on GPUs that do not
        support ``cudaLaunchCooperativeKernelMultiDevice``.
    def __init__(self, enable_predicate, output_dir, *, use_cuda=True):
        Args:
            enable_predicate (callable[trainer -> bool]): a function which takes a trainer,
                and returns whether to enable the profiler.
                It will be called once every step, and can be used to select which steps to profile.
            output_dir (str): the output directory to dump tracing files.
            use_cuda (bool): same as in `torch.autograd.profiler.profile`.
    def after_step(self):
        # The profiler may not have been started (see enable_predicate).
        if self._profiler is None:
            return
        self._profiler.__exit__(None, None, None)
        out_file = os.path.join(
            self._output_dir, "profiler-trace-iter{}.json".format(self.trainer.iter)
        )
        if "://" not in out_file:
            self._profiler.export_chrome_trace(out_file)
        else:
            # Support non-posix filesystems (e.g. s3) through PathManager;
            # export_chrome_trace requires a local path, so write a temp file first.
            with tempfile.TemporaryDirectory(prefix="fastreid_profiler") as d:
                tmp_file = os.path.join(d, "tmp.json")
                self._profiler.export_chrome_trace(tmp_file)
                with open(tmp_file) as f:
                    content = f.read()
                with PathManager.open(out_file, "w") as f:
                    f.write(content)
    Run an evaluation function periodically, and at the end of training.
    It is executed every ``eval_period`` iterations and after the last iteration.

    def __init__(self, eval_period, eval_function):
        Args:
            eval_period (int): the period to run `eval_function`.
            eval_function (callable): a function which takes no arguments, and
                returns a nested dict of evaluation metrics.

        Note:
            This hook must be enabled in all workers or in none of them.
            If you would like only certain workers to perform evaluation,
            give the other workers a no-op function (`eval_function=lambda: None`).
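
        Example (a sketch; ``do_test`` is a hypothetical function returning a
        nested dict such as ``{"market1501": {"Rank-1": 94.2, "mAP": 85.1}}``):

        .. code-block:: python

            trainer.register_hooks([hooks.EvalHook(cfg.TEST.EVAL_PERIOD, do_test)])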
    def _do_eval(self):
        results = self._func()

        if results:
            assert isinstance(
                results, dict
            ), "Eval function must return a dict. Got {} instead.".format(results)

            flattened_results = flatten_results_dict(results)
            for k, v in flattened_results.items():
                try:
                    v = float(v)
                except Exception:
                    raise ValueError(
                        "[EvalHook] eval_function should return a nested dict of float. "
                        "Got '{}: {}' instead.".format(k, v)
                    )
            self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
        torch.cuda.empty_cache()
    def after_step(self):
        next_iter = self.trainer.iter + 1
        is_final = next_iter == self.trainer.max_iter
        if is_final or (self._period > 0 and next_iter % self._period == 0):
            self._do_eval()
    The standard implementation of BatchNorm uses EMA in inference, which is
    sometimes suboptimal.
    This class computes the true average of statistics rather than the moving average,
    and puts the true averages into every BN layer in the given model.
    It is executed after the last iteration.

    def __init__(self, model, data_loader, num_iter):
        Args:
            model (nn.Module): a module whose BN layers in training mode will be
                updated by precise BN.
                Note that the user is responsible for ensuring that the BN layers to be
                updated are in training mode when this hook is triggered.
            data_loader (iterable): it will produce data to be run by `model(data)`.
            num_iter (int): number of iterations used to compute the precise
                statistics.
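
        Example (a sketch; reuses the training loader, and 200 estimation
        batches is an illustrative choice):

        .. code-block:: python

            trainer.register_hooks([
                hooks.PreciseBN(model, train_loader, num_iter=200)
            ])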
        if len(get_bn_modules(model)) == 0:
            self._logger.info(
                "PreciseBN is disabled because model does not contain BN layers in training mode."
            )
            self._disabled = True
            return
    def after_step(self):
        next_iter = self.trainer.iter + 1
        is_final = next_iter == self.trainer.max_iter
        if is_final:
            self.update_stats()
        Update the model with precise statistics. Users can manually call this method.
        if self._disabled:
            return

        def data_loader():
            for num_iter in itertools.count(1):
                if num_iter % 100 == 0:
                    self._logger.info(
                        "Running precise-BN ... {}/{} iterations.".format(num_iter, self._num_iter)
                    )
                # Reuse the same iterator across calls
                yield next(self._data_iter)

        with EventStorage():  # capture events in a new storage to discard them
            self._logger.info(
                "Running precise-BN for {} iterations... ".format(self._num_iter)
                + "Note that this could produce different statistics every time."
            )
            update_bn_stats(self._model, data_loader(), self._num_iter)
    def __init__(self, model, optimizer, freeze_layers, freeze_iters):
        self._logger = logging.getLogger(__name__)
        if isinstance(model, DistributedDataParallel):
            model = model.module
        self.model, self.optimizer = model, optimizer
        self.freeze_layers, self.freeze_iters = freeze_layers, freeze_iters
        # Save the original per-group freeze status so it can be restored later
        param_freeze = {}
        for param_group in self.optimizer.param_groups:
            param_name = param_group['name']
            param_freeze[param_name] = param_group['freeze']
        self.param_freeze = param_freeze
        for layer in self.freeze_layers:
            if not hasattr(self.model, layer):
                self._logger.info(f'{layer} is not an attribute of the model, will skip this layer')
                continue

        for param_group in self.optimizer.param_groups:
            param_name = param_group['name']
            # Freeze the groups whose top-level module is in the freeze list
            if param_name.split('.')[0] in self.freeze_layers:
                param_group['freeze'] = True
        # Change BN in frozen layers to eval mode
        for name, module in self.model.named_children():
            if name in self.freeze_layers:
                module.eval()

    def open_all_layer(self):
        # Restore the per-group freeze status saved at construction time
        for param_group in self.optimizer.param_groups:
            param_name = param_group['name']
            param_group['freeze'] = self.param_freeze[param_name]
    def __init__(self, swa_start: int, swa_freq: int, swa_lr_factor: float, eta_min: float, lr_sched=False):
    def before_step(self):
        is_swa = self.trainer.iter == self.swa_start
        if is_swa:
            # Wrap the optimizer with SWA
            self.trainer.optimizer = optim.SWA(self.trainer.optimizer, self.swa_freq, self.swa_lr_factor)
            self.trainer.optimizer.reset_lr_to_swa()

            if self.lr_sched:
                self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                    optimizer=self.trainer.optimizer,
                    T_0=self.swa_freq,
                    eta_min=self.eta_min,
                )
    def after_step(self):
        next_iter = self.trainer.iter + 1

        # Use the cyclic learning rate scheduler between averaging points
        if next_iter > self.swa_start and self.lr_sched:
            self.scheduler.step()

        is_final = next_iter == self.trainer.max_iter
        if is_final:
            self.trainer.optimizer.swap_swa_param()
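    # A usage sketch for SWA (illustrative numbers): start averaging at
    # iteration 9000, average with period 100, use swa_lr_factor to set the
    # averaging-phase LR, and drive a cosine-restart schedule between updates.
    #
    #   trainer.register_hooks([
    #       hooks.SWA(swa_start=9000, swa_freq=100, swa_lr_factor=0.1, eta_min=1e-7, lr_sched=True),
    #   ])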