Commit b602030

Parallel tests v2 (#276)
1 parent 65d7e03 commit b602030

14 files changed: 296 additions, 185 deletions

fast_llm/config.py

Lines changed: 0 additions & 1 deletion
@@ -274,7 +274,6 @@ def __init__(self, **kwargs):
 
         if dynamic_type is not None:
             for cls_, name in dynamic_type.items():
-                print(cls_, name, wrapped)
                 cls_.register_subclass(name, wrapped)
 
         return wrapped

setup.cfg

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ DEV =
     # Required for testing
     pytest>=8.3.2
     pytest-depends>=1.0.1
+    pytest-xdist>=3.6.1
     # Somehow needed for Megatron to work with base image 24.11
     setuptools>=75.6.0
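
Note: pytest-xdist joins pytest-depends in the DEV extras. With it installed, the suite can presumably be run across several workers with something like `pytest -n 4 tests/`; xdist's plain load distribution (the default for `-n`) is what the `pytest_xdist_make_scheduler` hook in tests/conftest.py below checks for before swapping in group-aware scheduling.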

tests/common.py

Lines changed: 76 additions & 70 deletions
@@ -32,7 +32,7 @@
 
 # TODO: Use `pytest_addoption` instead?
 # Keep all results in one place to allow recovering them for debugging in case of failure.
-TEST_RESULTS_PATH = pathlib.Path(os.environ.get("TEST_RESULTS_PATH", "/tmp/fast_llm_tests"))
+TEST_RESULTS_PATH = pathlib.Path(os.environ.get("TEST_RESULTS_PATH", "/tmp/fast_llm_tests")).resolve()
 FORCE_REUSE_RESULTS = int(os.environ.get("FORCE_REUSE_RESULTS", 0)) != 0
 REUSE_RESULTS = FORCE_REUSE_RESULTS or int(os.environ.get("REUSE_RESULTS", 0)) != 0
 _LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13))
@@ -350,78 +350,84 @@ def get_test_concatenated_memmap_dataset(
     index_file.open("w").writelines([str(path / f"dataset_{i}") + "\n" for i in range(num_files)])
 
 
-def run_test_script(
-    name: str,
-    script: list[str],
-    num_gpus: int = 1,
-    *,
-    model_type: str = TEST_MODEL_TYPE,
-    is_megatron: bool = False,
-    compare: str | None = None,
-    config: CompareConfig | None = None,
-    prepare_fn=None,
-    compare_fn=None,
-    do_compare: bool = True,
-):
-    if torch.cuda.device_count() < num_gpus:
-        pytest.skip(f"Not enough GPUs to run test ({torch.cuda.device_count()}<{num_gpus})")
-    env = os.environ.copy()
-    if is_megatron:
-        # Prevent Megatron from complaining.
-        env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
-        env["NVTE_FLASH_ATTN"] = "0"
-    path = TEST_RESULTS_PATH.resolve() / name
-    skip = False
-    artifact_path = path / ARTIFACT_PATH
-    if path.exists():
-        assert path.is_dir()
-        # TODO: Better way to check if the previous attempt succeeded.
-        if (
-            REUSE_RESULTS
-            and artifact_path.is_dir()
-            and len(list((artifact_path / "0").iterdir())) >= (1 if is_megatron else 3)
-        ):
-            skip = True
+@pytest.fixture(scope="session")
+def run_test_script(worker_resources):
+    def do_run_test_script(
+        name: str,
+        script: list[str],
+        num_gpus: int = 1,
+        *,
+        model_type: str = TEST_MODEL_TYPE,
+        is_megatron: bool = False,
+        compare: str | None = None,
+        config: CompareConfig | None = None,
+        prepare_fn=None,
+        compare_fn=None,
+        do_compare: bool = True,
+    ):
+        if torch.cuda.device_count() < num_gpus:
+            pytest.skip(f"Not enough GPUs to run test ({torch.cuda.device_count()}<{num_gpus})")
+        env = os.environ.copy()
+        if is_megatron:
+            # Prevent Megatron from complaining.
+            env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+            env["NVTE_FLASH_ATTN"] = "0"
+        path = TEST_RESULTS_PATH / name
+        skip = False
+        artifact_path = path / ARTIFACT_PATH
+        if path.exists():
+            assert path.is_dir()
+            # TODO: Better way to check if the previous attempt succeeded.
+            if (
+                REUSE_RESULTS
+                and artifact_path.is_dir()
+                and len(list((artifact_path / "0").iterdir())) >= (1 if is_megatron else 3)
+            ):
+                skip = True
+            elif FORCE_REUSE_RESULTS:
+                raise RuntimeError(artifact_path)
+            else:
+                shutil.rmtree(path)
         elif FORCE_REUSE_RESULTS:
-            raise RuntimeError(artifact_path)
+            raise RuntimeError(path)
+        if prepare_fn is not None:
+            skip = prepare_fn(TEST_RESULTS_PATH / name, None if compare is None else TEST_RESULTS_PATH / compare, skip)
+        if is_megatron:
+            script = [*script, f"--structured-logs-dir={path}", f"--data-cache-path={path}"]
         else:
-            shutil.rmtree(path)
-    elif FORCE_REUSE_RESULTS:
-        raise RuntimeError(path)
-    if prepare_fn is not None:
-        skip = prepare_fn(TEST_RESULTS_PATH / name, None if compare is None else TEST_RESULTS_PATH / compare, skip)
-    if is_megatron:
-        script = [*script, f"--structured-logs-dir={path}", f"--data-cache-path={path}"]
-    else:
-        script = [model_type, *script, f"run.experiment_dir={path}"]
-    header = ["Megatron-LM/pretrain_gpt.py"] if is_megatron else ["--no-python", "fast-llm", "train"]
-    command = [
-        "python",
-        "-m",
-        "torch.distributed.run",
-        f"--nproc-per-node={num_gpus}",
-        *header,
-        *script,
-    ]
-    print(" ".join(command))
-    if skip:
-        print("Reusing existing run.")
-    else:
-        get_test_dataset()
-        if num_gpus == 1 and not is_megatron:
-            CliTrainingConfig.parse_and_run(script)
+            script = [model_type, *script, f"run.experiment_dir={path}"]
+        header = ["Megatron-LM/pretrain_gpt.py"] if is_megatron else ["--no-python", "fast-llm", "train"]
+        command = [
+            "python",
+            "-m",
+            "torch.distributed.run",
+            f"--nproc-per-node={num_gpus}",
+            f"--rdzv-endpoint=localhost:{worker_resources.rendezvous_port}",
+            f"--master-port={worker_resources.torchrun_port}",
+            *header,
+            *script,
+        ]
+        print(" ".join(command))
+        if skip:
+            print("Reusing existing run.")
         else:
-            completed_proc = subprocess.run(command, env=env, timeout=60)
-            if completed_proc.returncode:
-                raise RuntimeError(f"Process failed with return code {completed_proc.returncode}")
-    if compare and do_compare:
-        if compare_fn is not None:
-            compare_fn(TEST_RESULTS_PATH / name, TEST_RESULTS_PATH / compare)
-        compare_tensor_logs(
-            TEST_RESULTS_PATH / compare / ARTIFACT_PATH,
-            TEST_RESULTS_PATH / name / ARTIFACT_PATH,
-            config,
-        )
+            get_test_dataset()
+            if num_gpus == 1 and not is_megatron:
+                CliTrainingConfig.parse_and_run(script)
+            else:
+                completed_proc = subprocess.run(command, env=env, timeout=60)
+                if completed_proc.returncode:
+                    raise RuntimeError(f"Process failed with return code {completed_proc.returncode}")
+        if compare and do_compare:
+            if compare_fn is not None:
+                compare_fn(TEST_RESULTS_PATH / name, TEST_RESULTS_PATH / compare)
+            compare_tensor_logs(
+                TEST_RESULTS_PATH / compare / ARTIFACT_PATH,
+                TEST_RESULTS_PATH / name / ARTIFACT_PATH,
+                config,
+            )
+
+    return do_run_test_script
 
 
 def materialize_meta_tensors(model, tensor_space):
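
With this change `run_test_script` becomes a session-scoped fixture instead of an importable helper, so tests take it as a parameter, and each pytest-xdist worker gets its own torchrun/rendezvous ports injected via `worker_resources`. A minimal usage sketch (the test name and argument list here are hypothetical, not taken from this commit):

def test_model_simple(run_test_script):
    # Hypothetical invocation: the first argument names the run directory under
    # TEST_RESULTS_PATH, the second is the Fast-LLM argument list for the run.
    run_test_script("test_model_simple", ["training.train_iters=2"], num_gpus=1)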

tests/conftest.py

Lines changed: 105 additions & 0 deletions
@@ -1,4 +1,16 @@
+import dataclasses
+import math
+import os
+
+import networkx
 import pytest
+import pytest_depends
+import pytest_depends.main
+import torch
+from xdist.scheduler import LoadGroupScheduling
+
+# Make fixtures available globally without import
+from tests.common import run_test_script  # isort: skip
 
 
 def pytest_addoption(parser):
@@ -11,13 +23,73 @@ def pytest_addoption(parser):
     )
 
 
+@dataclasses.dataclass
+class WorkerResources:
+    worker_id: int
+    gpu_id: int | None
+    num_gpus: int
+    torchrun_port: int
+    rendezvous_port: int
+
+
+MAX_TEST_MEMORY = 5e9
+CUDA_CONTEXT_SIZE = 7e8
+TORCHRUN_DEFAULT_PORT = 25900
+
+
 def pytest_configure(config):
     config.addinivalue_line("markers", "slow: Test is slow.")
     config.addinivalue_line(
         "markers", "extra_slow: Mark test as extra slow and skip unless --run-extra-slow is given."
     )
+    # TODO: Spawned processes (multi-gpu, Megatron) ignore resource allocation.
+    is_parallel = hasattr(config, "workerinput")
+    if is_parallel:
+        worker_name = config.workerinput["workerid"]
+        assert worker_name.startswith("gw")
+        worker_id = int(worker_name[2:])
+    else:
+        worker_id = 0
+
+    num_gpus = torch.cuda.device_count()
+    if num_gpus > 0 and is_parallel:
+        # We spread workers across GPUs.
+        gpu_id = worker_id % num_gpus
+        # We set the device through "CUDA_VISIBLE_DEVICES", and this needs to happen before cuda initialization.
+        # The `device_count` call above doesn't initialize, but `mem_get_info` below does.
+        assert not torch.cuda.is_initialized()
+        # TODO: Support this?
+        assert "CUDA_VISIBLE_DEVICES" not in os.environ
+        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str((gpu_id + i) % num_gpus) for i in range(num_gpus))
+    elif num_gpus > 0:
+        gpu_id = 0
+    else:
+        gpu_id = None
 
+    gpu_memory = torch.cuda.mem_get_info(0)[1] if num_gpus > 0 else 0
+    if num_gpus > 0:
+        torch.cuda.set_per_process_memory_fraction(MAX_TEST_MEMORY / gpu_memory, 0)
 
+    num_workers = config.workerinput["workercount"] if is_parallel else 1
+    if num_gpus > 0:
+        memory_needed = (MAX_TEST_MEMORY + CUDA_CONTEXT_SIZE) * math.ceil(num_workers / num_gpus)
+        if memory_needed > gpu_memory:
+            raise ValueError(
+                f"Not enough GPU memory to support this many parallel workers {num_workers}."
+                f"Please reduce the number of workers to {int(gpu_memory/(MAX_TEST_MEMORY + CUDA_CONTEXT_SIZE))*num_gpus} or less."
+            )
+
+    config.worker_resources = WorkerResources(
+        worker_id=worker_id,
+        gpu_id=gpu_id,
+        num_gpus=num_gpus,
+        # Each worker needs its own set of ports for safe distributed run. Hopefully these are free.
+        torchrun_port=TORCHRUN_DEFAULT_PORT + 2 * worker_id,
+        rendezvous_port=TORCHRUN_DEFAULT_PORT + 2 * worker_id + 1,
+    )
+
+
+@pytest.hookimpl(trylast=True)
 def pytest_collection_modifyitems(config, items):
     if config.getoption("--skip-slow"):
         skip_slow = pytest.mark.skip(reason="Skipping slow tests")
@@ -29,3 +101,36 @@ def pytest_collection_modifyitems(config, items):
         for item in items:
             if "extra_slow" in item.keywords:
                 item.add_marker(skip_extra_slow)
+
+    manager: pytest_depends.DependencyManager = pytest_depends.managers[-1]
+    # Build the undirected graph as in `DependencyManager.sorted_items`.
+    dag = networkx.DiGraph()
+    for item in manager.items:
+        node_id = pytest_depends.clean_nodeid(item.nodeid)
+        dag.add_node(node_id)
+        for dependency in manager.dependencies[node_id].dependencies:
+            dag.add_edge(dependency, node_id)
+    # Mark dependency groups for xdist.
+    manager.groups = {}
+    for i, node_ids in enumerate(sorted(networkx.weakly_connected_components(dag), key=len, reverse=True)):
+        if len(node_ids) > 1:
+            for node_id in node_ids:
+                manager.nodeid_to_item[node_id]._nodeid = (
+                    f"{manager.nodeid_to_item[node_id]._nodeid}@dependency_group_{i}"
+                )
+
+    old_clean_nodeid = pytest_depends.main.clean_nodeid
+    # Hack into `clean_nodeid` so pytest_depends recognizes the renamed nodes.
+    pytest_depends.main.clean_nodeid = lambda nodeid: old_clean_nodeid(nodeid.split("@dependency_group_")[0])
+
+
+@pytest.fixture(scope="session")
+def worker_resources(request) -> WorkerResources:
+    return request.config.worker_resources
+
+
+@pytest.mark.trylast
+def pytest_xdist_make_scheduler(config, log):
+    # Always use grouped load balancing to handle dependencies, and make it work with `-n`.
+    assert config.getvalue("dist") == "load"
+    return LoadGroupScheduling(config, log)
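
As an aside, a self-contained sketch of what the worker-to-GPU and port assignment in `pytest_configure` works out to, using the constants from this conftest (illustration only, not part of the commit):

# Illustration: reproduce the mapping from pytest_configure for 4 xdist workers and 2 GPUs.
TORCHRUN_DEFAULT_PORT = 25900
num_gpus, num_workers = 2, 4
for worker_id in range(num_workers):
    gpu_id = worker_id % num_gpus
    # Same rotation as above: the worker's preferred GPU is listed first in CUDA_VISIBLE_DEVICES.
    visible = ",".join(str((gpu_id + i) % num_gpus) for i in range(num_gpus))
    torchrun_port = TORCHRUN_DEFAULT_PORT + 2 * worker_id
    rendezvous_port = TORCHRUN_DEFAULT_PORT + 2 * worker_id + 1
    print(f"gw{worker_id}: CUDA_VISIBLE_DEVICES={visible} torchrun={torchrun_port} rendezvous={rendezvous_port}")

The dependency-group renaming works because pytest-xdist's load-group scheduling keys groups on the suffix after "@" in a test's node id; roughly (hypothetical test ids, not from the test suite):

# Hypothetical node ids after pytest_collection_modifyitems has renamed a dependency chain.
nodeids = [
    "tests/test_a.py::test_base@dependency_group_0",
    "tests/test_a.py::test_uses_base@dependency_group_0",
    "tests/test_b.py::test_unrelated",
]
# LoadGroupScheduling keeps every test sharing a suffix on one worker, so a pytest-depends
# chain never straddles two workers; ungrouped tests remain their own singleton groups.
groups = [n.rsplit("@", 1)[-1] if "@" in n else n for n in nodeids]
print(groups)  # ['dependency_group_0', 'dependency_group_0', 'tests/test_b.py::test_unrelated']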
