Skip to content

Commit

Permalink
Add new slurm sbatch profile
Browse files Browse the repository at this point in the history
  • Loading branch information
pierre.delaunay committed Jun 20, 2024
1 parent 7246295 commit 82c12f5
Show file tree
Hide file tree
Showing 11 changed files with 251 additions and 160 deletions.
53 changes: 53 additions & 0 deletions config/slurm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#
# sbatch arguments for the different run profiles
#

multi-node-full:
# DGX run: 2 nodes x 8 A100 80GB SXM4
- --partition=staff-idt
- -w cn-d[003-004]
- --ntasks=1
- --gpus-per-task=a100l:8
- --exclusive
- --nodes=2
- --cpus-per-task=128
- --time=1:30:00
- --ntasks-per-node=1
- --mem=0

single-node-full:
# DGX run: 1 node x 8 A100 80GB SXM4
- --partition=staff-idt
- -w cn-d[003-004]
- --ntasks=1
- --gpus-per-task=a100l:8
- --exclusive
- --nodes=1
- --cpus-per-task=128
- --time=1:30:00
- --ntasks-per-node=1
- --mem=0

multi-node-small:
# Any GPU, 2 nodes x 2 GPU
- --partition=staff-idt
- --ntasks=1
- --gpus-per-task=2
- --exclusive
- --nodes=2
- --cpus-per-task=16
- --time=1:30:00
- --ntasks-per-node=1
- --mem=64G

single-node-small:
# Any GPU, 1 node x 2 GPU
- --partition=staff-idt
- --ntasks=1
- --gpus-per-task=2
- --exclusive
- --nodes=1
- --cpus-per-task=16
- --time=1:30:00
- --ntasks-per-node=1
- --mem=64G
6 changes: 3 additions & 3 deletions milabench/_version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""This file is generated, do not modify"""

__tag__ = "v0.1.0-12-g39e7cce9"
__commit__ = "39e7cce9aec8a9e1ae7713137f287353ce718875"
__date__ = "2024-06-17 13:41:35 -0400"
__tag__ = "v0.1.0-20-g7246295a"
__commit__ = "7246295a356186b55fa4b2b75480e3700c279b15"
__date__ = "2024-06-20 09:18:17 -0400"
2 changes: 1 addition & 1 deletion milabench/cli/pr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from coleo import Option, tooled

from ..common import _short_make_report
from ..schedule import post_comment_on_pr
from .schedule import post_comment_on_pr


# fmt: off
Expand Down
49 changes: 30 additions & 19 deletions milabench/cli/schedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import importlib_resources
import requests
import yaml
from coleo import Option, tooled


Expand All @@ -14,6 +15,7 @@ class Arguments:
sync: bool = False
dry : bool = False
args: list = field(default_factory=list)
profile: str = None
# fmt: on


Expand All @@ -25,11 +27,29 @@ def arguments():
# Print the command and return without running it
dry: Option & bool = False

# pip arguments
# sbatch run profile
profile: Option & str = None

# script arguments
# [remainder]
args: Option = []

return Arguments(sync, dry, args)
return Arguments(sync, dry, args, profile)


def get_sbatch_profiles(profile, default):
    """Return the sbatch arguments stored under a named run profile.

    Profiles are read from ``config/slurm.yaml``, resolved two directories
    above the directory containing this module.

    Args:
        profile: Name of the requested profile. May be ``None``.
        default: Name of the profile used when ``profile`` is not present
            in the file.

    Returns:
        The value stored under ``profile``; if there is none, the value
        stored under ``default``; ``None`` when neither entry exists.
    """
    root = os.path.dirname(__file__)
    profile_file = os.path.join(root, "..", "..", "config", "slurm.yaml")

    with open(profile_file, "r", encoding="utf-8") as fp:
        # safe_load yields None for an empty document; treat that as
        # "no profiles defined" instead of failing on None.get below.
        sbatch_profiles = yaml.safe_load(fp) or {}

    args = sbatch_profiles.get(profile)

    if args is None:
        args = sbatch_profiles.get(default)

    return args


@tooled
Expand All @@ -39,9 +59,9 @@ def cli_schedule(args=None):
if args is None:
args = arguments()

launch_milabench(args.args, sbatch_args=None, dry=args.dry, sync=args.sync)

sbatch_args = get_sbatch_profiles(args.profile, "single-node-small")

launch_milabench(args.args, sbatch_args=sbatch_args, dry=args.dry, sync=args.sync)


def popen(cmd, callback=None):
Expand Down Expand Up @@ -119,7 +139,8 @@ class SetupOptions:
origin: str = "https://github.com/mila-iqia/milabench.git"
config: str = "milabench/config/standard.yaml"
env: str = "./env"
python: str = "3.9"
python: str = "3.10"
fun: str = "run"

def deduce_remote(self, current_branch):
prefix = "refs/heads/"
Expand Down Expand Up @@ -164,35 +185,25 @@ def arguments(self):
self.env,
"-p",
self.python,
"-f",
self.fun
]


def launch_milabench(args, sbatch_args=None, dry: bool = False, sync: bool = False):
sbatch_script = (
importlib_resources.files(__name__) / "scripts" / "milabench_run.bash"
os.path.abspath(importlib_resources.files(__name__) / ".." / "scripts" / "milabench_run.bash")
)
sbatch_script = str(sbatch_script)

# salloc --gres=gpu:rtx8000:1 --mem=64G --cpus-per-gpu=4

if sbatch_args is None:
sbatch_args = [
"--ntasks=1",
"--gpus-per-task=rtx8000:2",
"--cpus-per-task=8",
"--time=01:30:00",
"--ntasks-per-node=1",
"--mem=64G",
]

script_args = SetupOptions()
script_args.deduce_from_repository()
script_args = script_args.arguments()

cmd = sbatch_args + [sbatch_script] + script_args + args
print("sbatch " + " ".join(cmd))

if dry:
print("sbatch " + " ".join(cmd))
code = 0
else:
code, _ = sbatch(cmd, sync=sync, tags=None)
Expand Down
70 changes: 62 additions & 8 deletions milabench/cli/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
import os

from coleo import tooled
from voir.instruments.gpu import get_gpu_info

from ..slurm import expand_node_list
from ..system import get_gpu_capacity


@tooled
Expand All @@ -26,18 +25,73 @@ def make_node(i, ip):

return node

capacity = float("+inf")

for _, v in get_gpu_info("cuda")["gpus"].items():
capacity = min(v["memory"]["total"], capacity)

# nvidia-smi --query-gpu=memory.total --format=csv
system = {
"arch": "cuda",
"gpu": {"capacity": f"{int(capacity)} MiB"},
"nodes": [make_node(i, ip) for i, ip in enumerate(node_list)],
}

capacity = get_gpu_capacity()
if capacity > 0:
system["gpu"] = {
"capacity": f"{capacity} MiB"
}

import yaml

print(yaml.dump({"system": system}))


def expand_range(s):
    """Expand a comma-separated list of numbers and ranges into strings.

    Each comma-separated item is either a single value, which is kept
    verbatim, or an inclusive ``start-end`` range. Range members are
    zero-padded to the width of ``start``, so ``"003-004"`` gives
    ``["003", "004"]``.

    Args:
        s: The range specification, e.g. ``"1,3-5"``.

    Returns:
        The list of expanded values, as strings, in input order.
    """
    expanded = []

    for token in s.split(","):
        if "-" in token:
            first, last = token.split("-")
            # Keep the zero padding used by the lower bound.
            width = len(first)
            expanded.extend(
                f"{value:0{width}d}" for value in range(int(first), int(last) + 1)
            )
        else:
            expanded.append(token)

    return expanded


def expand_node_list(node_list):
    """Expand a compact node list into individual node names.

    The input is a comma-separated list whose entries are either a plain
    node name or a prefix followed by a bracketed range specification,
    e.g. ``"cn-d[003-004],cn-a001"``. Bracket contents are expanded with
    ``expand_range`` and each result is appended to the prefix.

    Args:
        node_list: The compact node list string.

    Returns:
        The list of node names, in input order. Empty entries caused by
        leading, trailing or doubled commas are ignored.

    Raises:
        ValueError: If a ``[`` has no matching ``]``.
    """
    nodes = []
    pos = 0
    length = len(node_list)

    while pos < length:
        # Skip separators so stray commas never produce empty node names.
        if node_list[pos] == ",":
            pos += 1
            continue

        comma = node_list.find(",", pos)
        bracket_open = node_list.find("[", pos)

        # A bracket before the next comma means this entry carries a range
        # (the range itself may contain commas).
        if bracket_open != -1 and (comma == -1 or bracket_open < comma):
            bracket_close = node_list.find("]", bracket_open)
            if bracket_close == -1:
                # Without this check the cursor would be reset to 0 and the
                # loop would never terminate.
                raise ValueError(f"Unbalanced '[' in node list: {node_list!r}")

            prefix = node_list[pos:bracket_open]
            spec = node_list[bracket_open + 1 : bracket_close]
            nodes.extend(f"{prefix}{suffix}" for suffix in expand_range(spec))

            # Resume right after the closing bracket.
            pos = bracket_close + 1
        else:
            if comma == -1:
                comma = length

            nodes.append(node_list[pos:comma])

            # Resume right after the comma.
            pos = comma + 1

    return nodes
2 changes: 1 addition & 1 deletion milabench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_run_count():

def get_base_folder():
config = config_global.get()
return XPath(config["dirs"]["base"])
return XPath(config["_defaults"]["dirs"]["base"])

def relative_to(pth, cwd):
pth = XPath(pth).expanduser()
Expand Down
Empty file removed milabench/schedule.py
Empty file.
Loading

0 comments on commit 82c12f5

Please sign in to comment.