Skip to content

Commit

Permalink
Make torchrun use docker in multinode
Browse files Browse the repository at this point in the history
  • Loading branch information
pierre.delaunay committed Jan 17, 2025
1 parent cf751f7 commit d73af7d
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 4 deletions.
5 changes: 3 additions & 2 deletions config/examples/system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ system:
# SSH key used for remote milabench operations
sshkey: ~/.ssh/id_ed25519

# Configures how to use docker
docker:
executable: podman
image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly
base: /tmp/workspace/
base: /tmp/workspace
args: [
-it, --rm, --ipc=host, --gpus=all,
-it, --rm, --ipc=host, --gpus=all, --network, host, --privileged,
-e, MILABENCH_HF_TOKEN=<TOKEN>,
-v, "${system.docker.base}/data:/milabench/envs/data",
-v, "${system.docker.base}/runs:/milabench/envs/runs",
Expand Down
5 changes: 3 additions & 2 deletions milabench/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ def node_address(node):
"""Return the node's IP address, falling back to its hostname when no IP is set"""
host = node.get("hostname")
ip = node.get("ip")
return ip or hostname
return ip or host


class ForeachNode(ListCommand):
Expand Down Expand Up @@ -706,7 +706,8 @@ def make_new_node_executor(self, rank, node, base):
]
executor.wrapper_argv = new_args

return executor
config = executor.pack.config
return DockerRunCommand(executor, DockerConfig(**config["system"].get("docker")))

def __init__(self, executor: Command, *args, **kwargs) -> None:
base_exec = TorchrunAllNodes.make_base_executor(
Expand Down

0 comments on commit d73af7d

Please sign in to comment.