Safemotion Lib
fastreid.engine.launch Namespace Reference

Functions

 _find_free_port ()
 
 launch (main_func, num_gpus_per_machine, num_machines=1, machine_rank=0, dist_url=None, args=())
 
 _distributed_worker (local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args)
 

Detailed Description

@author:  xingyu liao
@contact: sherlockliao01@gmail.com

Function Documentation

◆ _distributed_worker()

fastreid.engine.launch._distributed_worker(local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args)  [protected]

Definition at line 80 of file launch.py.

def _distributed_worker(
    local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
):
    # Relies on launch.py's module-level imports: logging, torch,
    # torch.distributed as dist, and fastreid's comm utilities (comm).
    print(f'local_rank = {local_rank}\n')
    # exit(0)

    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(
            backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
        )
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e
    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
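The global rank each worker passes to dist.init_process_group, and the per-machine local process group it joins, follow directly from machine_rank, num_gpus_per_machine and local_rank. The standalone sketch below (plain Python, not part of launch.py; the 2-machine, 4-GPU sizes are illustrative assumptions) prints the mapping the loop above would produce.

# Standalone sketch of the rank layout used by _distributed_worker.
# The 2 x 4 sizes are assumptions for illustration only.
num_machines = 2
num_gpus_per_machine = 4
world_size = num_machines * num_gpus_per_machine

for machine_rank in range(num_machines):
    # ranks_on_i: the same contiguous block that _distributed_worker hands to dist.new_group()
    ranks_on_i = list(range(machine_rank * num_gpus_per_machine,
                            (machine_rank + 1) * num_gpus_per_machine))
    for local_rank in range(num_gpus_per_machine):
        global_rank = machine_rank * num_gpus_per_machine + local_rank
        print(f"machine {machine_rank}, GPU {local_rank} -> global rank {global_rank} "
              f"(local group {ranks_on_i})")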

◆ _find_free_port()

fastreid.engine.launch._find_free_port()  [protected]

Definition at line 22 of file launch.py.

def _find_free_port():
    import socket

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Binding to port 0 will cause the OS to find an available port for us
    sock.bind(("", 0))
    port = sock.getsockname()[1]
    sock.close()
    # NOTE: there is still a chance the port could be taken by other processes.
    return port
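For context, the returned port is only useful once it is folded into an init URL, which is what launch() does when dist_url == "auto". The snippet below is a standalone sketch of that pattern; it assumes fastreid is importable so this protected helper can be reached as fastreid.engine.launch._find_free_port.

from fastreid.engine.launch import _find_free_port

port = _find_free_port()
dist_url = f"tcp://127.0.0.1:{port}"
print(dist_url)
# NOTE: the socket is closed before the URL is used, so another process could still
# claim the port in the meantime (the race the comment above warns about).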

◆ launch()

fastreid.engine.launch.launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0, dist_url=None, args=())
Launch multi-gpu or distributed training.
This function must be called on all machines involved in the training.
It will spawn child processes (defined by ``num_gpus_per_machine``) on each machine.
Args:
    main_func: a function that will be called by `main_func(*args)`
    num_gpus_per_machine (int): number of GPUs per machine
    num_machines (int): the total number of machines
    machine_rank (int): the rank of this machine
    dist_url (str): url to connect to for distributed jobs, including protocol
                   e.g. "tcp://127.0.0.1:8686".
                   Can be set to "auto" to automatically select a free port on localhost
    args (tuple): arguments passed to main_func

Definition at line 34 of file launch.py.

def launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0, dist_url=None, args=()):
    """
    Launch multi-gpu or distributed training.
    This function must be called on all machines involved in the training.
    It will spawn child processes (defined by ``num_gpus_per_machine``) on each machine.
    Args:
        main_func: a function that will be called by `main_func(*args)`
        num_gpus_per_machine (int): number of GPUs per machine
        num_machines (int): the total number of machines
        machine_rank (int): the rank of this machine
        dist_url (str): url to connect to for distributed jobs, including protocol
                        e.g. "tcp://127.0.0.1:8686".
                        Can be set to "auto" to automatically select a free port on localhost
        args (tuple): arguments passed to main_func
    """
    # Relies on launch.py's module-level imports: logging and torch.multiprocessing as mp.
    world_size = num_machines * num_gpus_per_machine
    if world_size > 1:
        # https://github.com/pytorch/pytorch/pull/14391
        # TODO prctl in spawned processes

        if dist_url == "auto":
            assert num_machines == 1, "dist_url=auto not supported in multi-machine jobs."
            port = _find_free_port()
            dist_url = f"tcp://127.0.0.1:{port}"
        if num_machines > 1 and dist_url.startswith("file://"):
            logger = logging.getLogger(__name__)
            logger.warning(
                "file:// is not a reliable init_method in multi-machine jobs. Prefer tcp://"
            )
        # print(f'main_func = {main_func}\n')
        print(f'num_gpus_per_machine = {num_gpus_per_machine}\n')
        # print(f'num_machines = {num_machines}\n')
        # print(f'machine_rank = {machine_rank}\n')
        # print(f'dist_url = {dist_url}\n')
        print(f'args = {args}\n')

        mp.spawn(
            _distributed_worker,
            nprocs=num_gpus_per_machine,
            args=(main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args),
            daemon=False,
        )
    else:
        main_func(*args)
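A minimal usage sketch, assuming a user-defined entry point: train() and its config/output paths below are placeholders, not fastreid APIs. launch() spawns num_gpus_per_machine worker processes via mp.spawn and calls train(*args) in each, so the entry point must be defined at module level (picklable) and the call guarded by if __name__ == "__main__".

from fastreid.engine.launch import launch

def train(cfg_file, output_dir):
    # placeholder training entry point; each spawned worker process runs this
    print(f"training with {cfg_file}, writing to {output_dir}")

if __name__ == "__main__":
    launch(
        train,
        num_gpus_per_machine=2,   # spawns 2 worker processes on this machine
        num_machines=1,
        machine_rank=0,
        dist_url="auto",          # picks a free localhost port via _find_free_port()
        args=("configs/example.yml", "./logs"),
    )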