diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index fe4b06dd263..11596e4a669 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -125,6 +125,7 @@ Guidelines for modifications:
 * Ziqi Fan
 * Zoe McCarthy
 * David Leon
+* Song Yi
 
 ## Acknowledgements
 
diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst
index 1f18a804ed0..f5d73e7a910 100644
--- a/docs/source/features/ray.rst
+++ b/docs/source/features/ray.rst
@@ -46,7 +46,7 @@ specifying the ``--num_workers`` argument for resource-wrapped jobs, or ``--num_
 for tuning jobs, which is especially critical for parallel aggregate job processing on local/virtual
 multi-GPU machines. Tuning jobs assume homogeneous node resource composition for nodes with GPUs.
 
-The two following files contain the core functionality of the Ray integration.
+The following three files contain the core functionality of the Ray integration.
 
 .. dropdown:: scripts/reinforcement_learning/ray/wrap_resources.py
    :icon: code
@@ -62,6 +62,12 @@ The two following files contain the core functionality of the Ray integration.
       :language: python
       :emphasize-lines: 18-53
 
+.. dropdown:: scripts/reinforcement_learning/ray/task_runner.py
+   :icon: code
+
+   .. literalinclude:: ../../../scripts/reinforcement_learning/ray/task_runner.py
+      :language: python
+      :emphasize-lines: 13-59
 
 The following script can be used to submit aggregate jobs to one or more Ray cluster(s), which can be used for
 
@@ -73,7 +79,7 @@ resource requirements.
 
    .. literalinclude:: ../../../scripts/reinforcement_learning/ray/submit_job.py
      :language: python
-      :emphasize-lines: 12-53
+      :emphasize-lines: 13-61
 
 The following script can be used to extract KubeRay cluster information for aggregate job submission.
 
@@ -151,6 +157,15 @@ Submitting resource-wrapped individual jobs instead of automatic tuning runs is
       :language: python
       :emphasize-lines: 14-66
 
+The following script supports specifying per-task resources, as well as ``py_modules`` and ``pip`` packages, for each run.
+
+.. dropdown:: scripts/reinforcement_learning/ray/task_runner.py
+   :icon: code
+
+   .. literalinclude:: ../../../scripts/reinforcement_learning/ray/task_runner.py
+      :language: python
+      :emphasize-lines: 9-55
+
 Transferring files from the running container can be done as follows.
 
 .. code-block:: bash
 
diff --git a/scripts/reinforcement_learning/ray/submit_job.py b/scripts/reinforcement_learning/ray/submit_job.py
index 27c00eda71f..84441eb7638 100644
--- a/scripts/reinforcement_learning/ray/submit_job.py
+++ b/scripts/reinforcement_learning/ray/submit_job.py
@@ -26,7 +26,11 @@
 creates several individual jobs when started on a cluster. Alternatively, an aggregate
 job could be a :file:'../wrap_resources.py` resource-wrapped job,
 which may contain several individual sub-jobs separated by
-the + delimiter.
+the + delimiter. An aggregate job could also be a :file:`../task_runner.py` multi-task submission job,
+where each sub-job and its resource requirements are defined in a YAML configuration file.
+In this mode, :file:`../task_runner.py` reads the YAML file (passed via ``--task_cfg``) and
+submits all defined sub-tasks to the Ray cluster, supporting per-job resource specification and
+real-time streaming of sub-job outputs.
 
 If there are more aggregate jobs than cluster(s), aggregate jobs will be submitted
 as clusters become available via the defined relation above. If there are less aggregate job(s)
@@ -48,6 +52,10 @@
     # Example: Submitting resource wrapped job
     python3 scripts/reinforcement_learning/ray/submit_job.py --aggregate_jobs wrap_resources.py --test
 
+    # Example: Submitting tasks with per-task resources, pip packages, and py_modules
+    # Relative paths may be used for task_cfg and py_modules by placing the files in the scripts/reinforcement_learning/ray directory, which is uploaded to the cluster.
+    python3 scripts/reinforcement_learning/ray/submit_job.py --aggregate_jobs task_runner.py --task_cfg tasks.yaml
+
     # For all command line arguments
     python3 scripts/reinforcement_learning/ray/submit_job.py -h
 """
 
diff --git a/scripts/reinforcement_learning/ray/task_runner.py b/scripts/reinforcement_learning/ray/task_runner.py
new file mode 100644
index 00000000000..9bbfda2791f
--- /dev/null
+++ b/scripts/reinforcement_learning/ray/task_runner.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2022-2025, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import argparse
+import sys
+import yaml
+
+import ray
+import util
+
+"""
+This script dispatches one or more user-defined Python tasks to workers in a Ray cluster.
+Each task, together with its resource requirements and execution parameters, is described in a YAML configuration file.
+The desired number of CPUs and GPUs and the memory allocation for each task can be specified in the config file.
+
+Key features:
+- Flexible per-task resource management via the config fields `num_gpus`, `num_cpus`, and `memory`.
+- Real-time output streaming (stdout/stderr) for each task.
+- Parallel execution of multiple tasks across cluster resources.
+
+Tasks are distributed and scheduled by Ray's built-in resource manager.
+
+Typical usage:
+---------------
+
+.. code-block:: bash
+
+    # Print help and argument details:
+    python task_runner.py -h
+
+    # Submit tasks defined in a YAML file to the Ray cluster (auto-detects the Ray head address):
+    python task_runner.py --task_cfg /path/to/tasks.yaml
+
+YAML configuration example:
+---------------------------
+
+.. code-block:: yaml
+
+    pip: ["xxx"]
+    py_modules: ["my_package/my_package"]
+    tasks:
+      - name: "task1"
+        py_args: "-m torch.distributed.run --nnodes=1 --nproc_per_node=2 --rdzv_endpoint=localhost:29501 /workspace/isaaclab/scripts/reinforcement_learning/rsl_rl/train.py --task=Isaac-Cartpole-v0 --max_iterations 200 --headless --distributed"
+        num_gpus: 2
+        num_cpus: 10
+        memory: 10737418240
+      - name: "task2"
+        py_args: "script.py --option arg"
+        num_gpus: 0
+        num_cpus: 1
+        memory: "10*1024*1024*1024"
+
+- `name`: Label used to identify the task in the streamed output.
+- `pip`: List of pip packages to install in the Ray runtime environment.
+- `py_modules`: List of local Python modules/packages to upload to the cluster.
+- `py_args`: Arguments passed to the Python executable for this task.
+- `num_gpus`, `num_cpus`: Number of GPUs/CPUs to allocate. Can be an integer or a string expression like `"2*2"`.
+- `memory`: Amount of memory (bytes) to allocate. Can be an integer or a string expression like `"10*1024*1024*1024"`.
+
+To stop all tasks early, press Ctrl+C; the script will cancel all running Ray tasks.
+""" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run tasks from a YAML config file.") + parser.add_argument("--task_cfg", type=str, required=True, help="Path to the YAML task file.") + parser.add_argument("--ray_address", type=str, default="auto", help="the Ray address.") + parser.add_argument( + "--test", + action="store_true", + help=( + "Run nvidia-smi test instead of the arbitrary job," + "can use as a sanity check prior to any jobs to check " + "that GPU resources are correctly isolated." + ), + ) + return parser.parse_args() + + +def parse_task_opt(task): + opts = {} + if "num_gpus" in task: + opts["num_gpus"] = eval(task["num_gpus"]) if isinstance(task["num_gpus"], str) else task["num_gpus"] + if "num_cpus" in task: + opts["num_cpus"] = eval(task["num_cpus"]) if isinstance(task["num_cpus"], str) else task["num_cpus"] + if "memory" in task: + opts["memory"] = eval(task["memory"]) if isinstance(task["memory"], str) else task["memory"] + return opts + + +@ray.remote +def remote_execute_job(job_cmd: str, identifier_string: str, test_mode: bool) -> str | dict: + return util.execute_job( + job_cmd=job_cmd, + identifier_string=identifier_string, + test_mode=test_mode, + log_all_output=True, # make log_all_output=True to check output in real time + ) + + +def run_tasks(ray_address, pip, py_modules, tasks, test_mode=False): + if not tasks: + print("[WARNING]: no tasks to submit") + return + + if not ray.is_initialized(): + try: + ray.init( + address=ray_address, + log_to_driver=True, + runtime_env={ + "pip": pip, + "py_modules": py_modules, + }, + ) + except Exception as e: + raise RuntimeError(f"initialize ray failed: {str(e)}") + task_results = [] + for task in tasks: + opts = parse_task_opt(task) + task_cmd = " ".join([sys.executable, *task["py_args"].split()]) + print(f"[INFO] submitting task {task['name']} with opts={opts}: {task_cmd}") + task_results.append(remote_execute_job.options(**opts).remote(task_cmd, task["name"], test_mode)) + + try: + results = ray.get(task_results) + for i, result in enumerate(results): + print(f"[INFO]: Task {tasks[i]['name']} result: \n{result}") + print("[INFO]: all tasks completed.") + except KeyboardInterrupt: + print("[INFO]: dealing with keyboard interrupt") + for future in task_results: + ray.cancel(future, force=True) + print("[INFO]: all tasks cancelled.") + sys.exit(1) + except Exception as e: + print(f"[ERROR]: error while running tasks: {str(e)}") + raise e + + +def main(): + args = parse_args() + try: + with open(args.task_cfg) as f: + config = yaml.safe_load(f) + except Exception as e: + raise SystemExit(f"error while loading task config: {str(e)}") + tasks = config["tasks"] + py_modules = config.get("py_modules") + pip = config.get("pip") + run_tasks( + ray_address=args.ray_address, + pip=pip, + py_modules=py_modules, + tasks=tasks, + test_mode=args.test, + ) + + +if __name__ == "__main__": + main()