
Commit 1982e59

mcr_dl_megatron changes
Signed-off-by: Radha Guhane <[email protected]>
1 parent: cee0b27 · commit: 1982e59

File tree: 4 files changed (+19, -10 lines)

  mcr_dl/__init__.py
  mcr_dl/constants.py
  mcr_dl/mpi.py
  mcr_dl/torch.py


mcr_dl/__init__.py (+12, -5)

@@ -22,30 +22,33 @@
 global __dist_engine
 global __dist_backend
 
+__dist_engine = None
+__dist_backend = None
+
 def init_torch_distributed(backend):
     import torch.distributed as dist
     if backend == 'nccl':
         mpi_discovery()
     elif backend == 'mpi':
         set_mpi_dist_environemnt()
-    dist.init_process_group(backend)
+    dist.init_process_group(backend=backend)
     local_rank = int(os.environ['LOCAL_RANK'])
-    get_accelerator().set_device(local_rank)
+    # get_accelerator().set_device(local_rank)
+    print(f'Rank : {dist.get_rank()} World_Size : {dist.get_world_size()}', flush = True)
 
 def init_mcr_dl_comm(backend):
     import mcr_dl
     mcr_dl.init_distributed(dist_backend=backend, use_mcr_dl=True)
     local_rank = int(os.environ['LOCAL_RANK'])
     #get_accelerator().set_device(local_rank)
 
-def init_processes(dist_engine, dist_backend):
+def init_processes(dist_engine, dist_backend, world_size = -1, rank = -1, timeout = None, init_method = None):
     print(f'Comm : {dist_engine} Backend : {dist_backend}')
 
     global __dist_engine
     global __dist_backend
     __dist_engine = dist_engine
     __dist_backend = dist_backend
-
     if dist_engine == 'mcr_dl':
         init_mcr_dl_comm(dist_backend)
     elif dist_engine == 'torch':

@@ -56,8 +59,12 @@ def init_processes(dist_engine, dist_backend):
 
 def get_distributed_engine():
     global __dist_engine
+    if __dist_engine is None:
+        return None
     if __dist_engine == 'torch':
         return torch.distributed
     elif __dist_engine == 'mcr_dl':
         import mcr_dl
-        return mcr_dl
+        return mcr_dl
+    print(f"Unsupported values for __dist_engine. Expected values 'torch' or 'mcr_dl'")
+    exit(0)
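Note (not part of the commit): a minimal usage sketch of the updated entry point, assuming the launcher has already exported the usual environment variables (LOCAL_RANK, MASTER_ADDR, MASTER_PORT, ...); the engine/backend values are illustrative.

# Hypothetical caller, e.g. a Megatron-style training script. Only dist_engine
# and dist_backend are required; world_size, rank, timeout and init_method
# fall back to the new defaults (-1, -1, None, None).
import mcr_dl

mcr_dl.init_processes(dist_engine='torch', dist_backend='nccl')

dist = mcr_dl.get_distributed_engine()   # torch.distributed for the 'torch' engine
if dist is not None:                     # returns None until init_processes() has run
    print(f'rank {dist.get_rank()} of {dist.get_world_size()}')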

mcr_dl/constants.py (+1, -1)

@@ -70,7 +70,7 @@
 #############################################
 # Torch distributed constants
 #############################################
-TORCH_DISTRIBUTED_DEFAULT_PORT = 29500
+TORCH_DISTRIBUTED_DEFAULT_PORT = 29600
 
 # Default process group wide timeout, if applicable.
 # This only applies to the gloo and nccl backends
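Note (not part of the commit): the only change here is the default rendezvous port, 29500 to 29600. As a hedged illustration of where such a constant typically ends up, a hypothetical helper that seeds MASTER_PORT when the launcher has not set it:

# Illustrative only: ensure_master_port() is not code from this repository;
# the constant itself comes from mcr_dl/constants.py.
import os
from mcr_dl.constants import TORCH_DISTRIBUTED_DEFAULT_PORT   # now 29600

def ensure_master_port():
    os.environ.setdefault('MASTER_PORT', str(TORCH_DISTRIBUTED_DEFAULT_PORT))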

mcr_dl/mpi.py (+3, -2)

@@ -72,12 +72,13 @@ def destroy_process_group(self, group=None):
         pass
 
     def new_group(self, ranks):
-        # TODO: Change this to use comm_op.new_group when the impl. is ready.
+        # TODO: Change this to use self.mpi_comm_op.new_group(ranks) when the impl. is ready.
         if not torch.distributed.is_initialized():
             from mcr_dl.torch import TorchBackend
-            d = TorchBackend(rank=self.rank, size=self.size)
+            d = TorchBackend(rank=self.rank, world_size=self.size)
         logger.info(f"new group called with {ranks}")
         return torch.distributed.new_group(ranks)
+        # return self.mpi_comm_op.new_group(ranks)
 
     def get_rank(self, group=None):
         return self.mpi_comm_op.get_rank(0)
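Note (not part of the commit): the keyword rename matters because TorchBackend takes world_size, not size, so the old fallback call could not bind; combined with the new constructor defaults in mcr_dl/torch.py below, the call now works. A small sketch of what new_group() falls back to when torch.distributed is not yet initialized (rank/size values are illustrative):

# Sketch only: values are illustrative; backend, init_method and timeout come
# from the new TorchBackend defaults shown in the next file.
from mcr_dl.torch import TorchBackend

d = TorchBackend(rank=0, world_size=2)   # formerly passed size=2, which TorchBackend does not accept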

mcr_dl/torch.py (+3, -2)

@@ -23,6 +23,7 @@
 from .utils import *
 from .backend import *
 from .comm import *
+from .constants import default_pg_timeout
 
 DS_COMM_ALL_GATHER_OFF = False
 DS_COMM_REDUCE_SCATTER_OFF = False

@@ -119,7 +120,7 @@ class TorchBackend(Backend):
     needed.
     """
 
-    def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name='torch'):
+    def __init__(self, backend="mpi", init_method = None, timeout = default_pg_timeout, rank=-1, world_size=-1, name='torch'):
         super(TorchBackend, self).__init__()
         self.has_all_reduce_coalesced = has_all_reduce_coalesced()
         self.has_coalescing_manager = has_coalescing_manager()

@@ -131,7 +132,7 @@ def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name='torch'):
         # The idea is to fake that dist backend is initialized even when
         # it is not so we can run on a single GPU without doing any init_process_group
         self.single_gpu_mode = True
-        self.init_process_group(backend, timeout, init_method, rank, world_size)
+        self.init_process_group(backend=backend, init_method=init_method, timeout= timeout, rank=rank, world_size= world_size)
 
     @classmethod
     def get_all_gather_function(self):
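Note (not part of the commit): with this change every constructor argument of TorchBackend has a default (previously backend, timeout and init_method were required positional parameters), and the arguments are now forwarded to init_process_group by keyword. A brief sketch of the equivalence (values are illustrative):

# Sketch only: the short form relies on the new defaults backend="mpi",
# init_method=None and the imported default_pg_timeout.
from mcr_dl.torch import TorchBackend

tb = TorchBackend(rank=0, world_size=1)
# ...is now equivalent to the fully spelled-out call:
# TorchBackend(backend="mpi", init_method=None, timeout=default_pg_timeout,
#              rank=0, world_size=1, name='torch')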
