From a37e5f504cebd7457ee512c022e541974d2d7629 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=BE=E7=BF=8A?=
Date: Wed, 2 Jul 2025 17:16:49 +0800
Subject: [PATCH 1/3] =?UTF-8?q?=E2=9C=A8=20feat(Ray):=20Enhance=20Ray?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add task_runner.py to support specifying resources, py_modules, and pip.

https://github.com/isaac-sim/IsaacLab/issues/2632
---
 CONTRIBUTORS.md                               |   1 +
 docs/source/features/ray.rst                  |  19 +-
 .../reinforcement_learning/ray/submit_job.py  |   9 +-
 .../reinforcement_learning/ray/task_runner.py | 176 ++++++++++++++++++
 4 files changed, 202 insertions(+), 3 deletions(-)
 create mode 100644 scripts/reinforcement_learning/ray/task_runner.py

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index fe4b06dd263..11596e4a669 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -125,6 +125,7 @@ Guidelines for modifications:
 * Ziqi Fan
 * Zoe McCarthy
 * David Leon
+* Song Yi
 
 ## Acknowledgements
 
diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst
index 1f18a804ed0..a8b61cd0c31 100644
--- a/docs/source/features/ray.rst
+++ b/docs/source/features/ray.rst
@@ -46,7 +46,7 @@ specifying the ``--num_workers`` argument for resource-wrapped jobs, or ``--num_
 for tuning jobs, which is especially critical for parallel aggregate job processing on local/virtual
 multi-GPU machines. Tuning jobs assume homogeneous node resource composition for nodes with GPUs.
 
-The two following files contain the core functionality of the Ray integration.
+The three following files contain the core functionality of the Ray integration.
 
 .. dropdown:: scripts/reinforcement_learning/ray/wrap_resources.py
    :icon: code
@@ -62,6 +62,12 @@ The two following files contain the core functionality of the Ray integration.
       :language: python
       :emphasize-lines: 18-53
 
+.. dropdown:: scripts/reinforcement_learning/ray/task_runner.py
+   :icon: code
+
+   .. literalinclude:: ../../../scripts/reinforcement_learning/ray/task_runner.py
+      :language: python
+      :emphasize-lines: 9-55
 
 The following script can be used to submit aggregate
 jobs to one or more Ray cluster(s), which can be used for
@@ -73,7 +79,7 @@ resource requirements.
 
    .. literalinclude:: ../../../scripts/reinforcement_learning/ray/submit_job.py
       :language: python
-      :emphasize-lines: 12-53
+      :emphasize-lines: 13-59
 
 The following script can be used to extract KubeRay cluster information for aggregate job submission.
 
@@ -151,6 +157,15 @@ Submitting resource-wrapped individual jobs instead of automatic tuning runs is
       :language: python
       :emphasize-lines: 14-66
 
+The following script supports specifying per-task resources and setting ``py_modules`` and ``pip`` packages for each run.
+
+.. dropdown:: scripts/reinforcement_learning/ray/task_runner.py
+   :icon: code
+
+   .. literalinclude:: ../../../scripts/reinforcement_learning/ray/task_runner.py
+      :language: python
+      :emphasize-lines: 9-55
+
 Transferring files from the running container can be done as follows.
 
 .. code-block:: bash
 
diff --git a/scripts/reinforcement_learning/ray/submit_job.py b/scripts/reinforcement_learning/ray/submit_job.py
index 27c00eda71f..8d0649ebf8a 100644
--- a/scripts/reinforcement_learning/ray/submit_job.py
+++ b/scripts/reinforcement_learning/ray/submit_job.py
@@ -26,7 +26,11 @@
 creates several individual jobs when started on a cluster.
 
 Alternatively, an aggregate job could be a :file:'../wrap_resources.py`
 resource-wrapped job, which may contain several individual sub-jobs separated by
-the + delimiter.
+the + delimiter. An aggregate job could also be a :file:`../task_runner.py` multi-task submission job,
+where each sub-job and its resource requirements are defined in a YAML configuration file.
+In this mode, :file:`../task_runner.py` will read the YAML file (via --task_cfg) and
+submit all defined sub-tasks to the Ray cluster, supporting per-job resource specification and
+real-time streaming of sub-job outputs.
 
 If there are more aggregate jobs than cluster(s), aggregate jobs will be
 submitted as clusters become available via the defined relation above.
 If there are less aggregate job(s)
@@ -48,6 +52,9 @@
     # Example: Submitting resource wrapped job
     python3 scripts/reinforcement_learning/ray/submit_job.py --aggregate_jobs wrap_resources.py --test
 
+    # Example: submitting tasks with per-task resources, pip packages, and py_modules
+    python3 scripts/reinforcement_learning/ray/submit_job.py --aggregate_jobs task_runner.py --task_cfg tasks.yaml
+
     # For all command line arguments
     python3 scripts/reinforcement_learning/ray/submit_job.py -h
 """

diff --git a/scripts/reinforcement_learning/ray/task_runner.py b/scripts/reinforcement_learning/ray/task_runner.py
new file mode 100644
index 00000000000..044b0ab9147
--- /dev/null
+++ b/scripts/reinforcement_learning/ray/task_runner.py
@@ -0,0 +1,176 @@
+import yaml
+import ray
+import sys
+import argparse
+import subprocess
+import threading
+from enum import Enum
+
+"""
+This script dispatches one or more user-defined Python tasks to workers in a Ray cluster.
+Each task, with its resource requirements and execution parameters, is described in a YAML configuration file.
+You may specify the desired number of CPUs, GPUs, and memory allocation for each task in the config file.
+
+Key features:
+- Flexible resource management per task via config fields (`num_gpus`, `num_cpus`, `memory`).
+- Real-time output streaming (stdout/stderr) for each task.
+- Parallel execution of multiple tasks across cluster resources.
+
+Tasks are distributed and scheduled according to Ray’s built-in resource manager.
+
+Typical usage:
+---------------
+
+.. code-block:: bash
+
+    # Print help and argument details:
+    python task_runner.py -h
+
+    # Submit tasks defined in a YAML file to the Ray cluster (auto-detects Ray head address):
+    python task_runner.py --task_cfg /path/to/tasks.yaml
+
+YAML configuration example:
+---------------------------
+
+.. code-block:: yaml
+
+    pip: ["xxx"]
+    py_modules: ["my_package/my_package"]
+    tasks:
+      - name: "task1"
+        py_args: "-m torch.distributed.run --nnodes=1 ..."
+        num_gpus: 2
+        num_cpus: 10
+        memory: 10737418240
+      - name: "task2"
+        py_args: "script.py --option arg"
+        num_gpus: 0
+        num_cpus: 1
+        memory: "10*1024*1024*1024"
+
+- `name`: Human-readable identifier for the task, used in log messages.
+- `pip`: List of pip packages to install in the runtime environment.
+- `py_modules`: List of local Python modules to upload to the cluster.
+- `py_args`: Arguments passed to the Python executable for this task.
+- `num_gpus`, `num_cpus`: Number of GPUs/CPUs to allocate. Can be an integer or a string like `"2*2"`.
+- `memory`: Amount of memory (bytes) to allocate. Can be an integer or a string like `"10*1024*1024*1024"`.
+
+To stop all tasks early, press Ctrl+C; the script will cancel all running Ray tasks.
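+
+The same YAML file can also be dispatched through :file:`submit_job.py` as an aggregate job:
+
+.. code-block:: bash
+
+    python3 scripts/reinforcement_learning/ray/submit_job.py --aggregate_jobs task_runner.py --task_cfg tasks.yaml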
+""" + +class OutputType(str, Enum): + STDOUT = "stdout" + STDERR = "stderr" + +def parse_args(): + parser = argparse.ArgumentParser(description="Run tasks from a YAML config file.") + parser.add_argument("--task_cfg", type=str, required=True, help="Path to the YAML task file.") + parser.add_argument("--ray_address", type=str, default="auto", help="the Ray address.") + return parser.parse_args() + +@ray.remote +def task_wrapper(task): + task_name = task["name"] + task_py_args = task["py_args"] + + # build command + cmd = [sys.executable, *task_py_args.split()] + print(f"[INFO]: {task_name} run: {' '.join(cmd)}") + def handle_stream(stream, output_type): + for line in iter(stream.readline, ''): + stripped_line = line.rstrip('\n') + if output_type == OutputType.STDOUT: + print(stripped_line) + elif output_type == OutputType.STDERR: + print(stripped_line, file=sys.stderr) + stream.close() + try: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1 # None for best performance and 1 for realtime output + ) + + # start tow threads to read stdout and stderr + stdout_thread = threading.Thread( + target=handle_stream, args=(process.stdout, OutputType.STDOUT) + ) + stderr_thread = threading.Thread( + target=handle_stream, args=(process.stderr, OutputType.STDERR) + ) + stdout_thread.start() + stderr_thread.start() + # wait for process to finish + process.wait() + # wait for threads to finish + stdout_thread.join() + stderr_thread.join() + + returncode = process.returncode + except Exception as e: + print(f"[ERROR]: error while running task {task_name}: {str(e)}" ) + raise e + + print(f"[INFO]: task {task_name} finished with return code {returncode}") + return True + + +def submit_tasks(ray_address,pip,py_modules,tasks): + if not tasks: + print("[WARNING]: no tasks to submit") + return + + if not ray.is_initialized(): + try: + ray.init(address=ray_address, log_to_driver=True, runtime_env={ + "pip": pip, + "py_modules": py_modules, + }) + except Exception as e: + raise RuntimeError(f"initialize ray failed: {str(e)}") + task_results = [] + for task in tasks: + num_gpus = eval(task["num_gpus"]) if isinstance(task["num_gpus"], str) else task["num_gpus"] + num_cpus = eval(task["num_cpus"]) if isinstance(task["num_cpus"], str) else task["num_cpus"] + memory = eval(task["memory"]) if isinstance(task["memory"], str) else task["memory"] + print(f"[INFO]: submitting task {task['name']} with num_gpus={num_gpus}, num_cpus={num_cpus}, memory={memory}") + task_results.append(task_wrapper.options( + num_gpus=num_gpus, + num_cpus=num_cpus, + memory=memory, + ).remote(task)) + + try: + results = ray.get(task_results) + for i, _ in enumerate(results): + print(f"[INFO]: Task {tasks[i]['name']} finished") + print("[INFO]: all tasks completed.") + except KeyboardInterrupt: + print("[INFO]: dealing with keyboard interrupt") + for future in task_results: + ray.cancel(future,force=True) + print("[INFO]: all tasks cancelled.") + sys.exit(1) + except Exception as e: + print(f"[ERROR]: error while running tasks: {str(e)}") + raise e + + +def main(): + args = parse_args() + try: + with open(args.task_cfg, 'r') as f: + config = yaml.safe_load(f) + except Exception as e: + raise SystemExit(f"error while loading task config: {str(e)}") + tasks = config["tasks"] + py_modules = config.get("py_modules",None) + pip = config.get("pip",None) + submit_tasks( + ray_address=args.ray_address, + pip=pip, + py_modules=py_modules, + tasks=tasks, + ) + +if __name__ == "__main__": + 
+    main()
+

From 46ca9430ec5b2cdcf5c3888c97318cc09e7f05b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=BE=E7=BF=8A?=
Date: Tue, 8 Jul 2025 17:59:33 +0800
Subject: [PATCH 2/3] refactor(Ray): Use execute_job and improve code structure

---
 .../reinforcement_learning/ray/task_runner.py | 153 ++++++++----------
 1 file changed, 68 insertions(+), 85 deletions(-)

diff --git a/scripts/reinforcement_learning/ray/task_runner.py b/scripts/reinforcement_learning/ray/task_runner.py
index 044b0ab9147..9bbfda2791f 100644
--- a/scripts/reinforcement_learning/ray/task_runner.py
+++ b/scripts/reinforcement_learning/ray/task_runner.py
@@ -1,10 +1,14 @@
+# Copyright (c) 2022-2025, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import argparse
+import sys
 import yaml
+
 import ray
-import sys
-import argparse
-import subprocess
-import threading
-from enum import Enum
+import util
 
 """
 This script dispatches one or more user-defined Python tasks to workers in a Ray cluster.
@@ -36,7 +40,7 @@
     py_modules: ["my_package/my_package"]
     tasks:
       - name: "task1"
-        py_args: "-m torch.distributed.run --nnodes=1 ..."
+        py_args: "-m torch.distributed.run --nnodes=1 --nproc_per_node=2 --rdzv_endpoint=localhost:29501 /workspace/isaaclab/scripts/reinforcement_learning/rsl_rl/train.py --task=Isaac-Cartpole-v0 --max_iterations 200 --headless --distributed"
         num_gpus: 2
         num_cpus: 10
        memory: 10737418240
@@ -54,99 +58,77 @@ To stop all tasks early, press Ctrl+C; the script will cancel all running Ray tasks.
 """
 
-class OutputType(str, Enum):
-    STDOUT = "stdout"
-    STDERR = "stderr"
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Run tasks from a YAML config file.")
     parser.add_argument("--task_cfg", type=str, required=True, help="Path to the YAML task file.")
     parser.add_argument("--ray_address", type=str, default="auto", help="The Ray address.")
+    parser.add_argument(
+        "--test",
+        action="store_true",
+        help=(
+            "Run nvidia-smi test instead of the arbitrary job; "
+            "can be used as a sanity check prior to any jobs to verify "
+            "that GPU resources are correctly isolated."
+        ),
+    )
     return parser.parse_args()
 
-@ray.remote
-def task_wrapper(task):
-    task_name = task["name"]
-    task_py_args = task["py_args"]
-
-    # build command
-    cmd = [sys.executable, *task_py_args.split()]
-    print(f"[INFO]: {task_name} run: {' '.join(cmd)}")
-    def handle_stream(stream, output_type):
-        for line in iter(stream.readline, ''):
-            stripped_line = line.rstrip('\n')
-            if output_type == OutputType.STDOUT:
-                print(stripped_line)
-            elif output_type == OutputType.STDERR:
-                print(stripped_line, file=sys.stderr)
-        stream.close()
-    try:
-        process = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            bufsize=1  # None for best performance, 1 for real-time output
-        )
-
-        # start two threads to read stdout and stderr
-        stdout_thread = threading.Thread(
-            target=handle_stream, args=(process.stdout, OutputType.STDOUT)
-        )
-        stderr_thread = threading.Thread(
-            target=handle_stream, args=(process.stderr, OutputType.STDERR)
-        )
-        stdout_thread.start()
-        stderr_thread.start()
-        # wait for process to finish
-        process.wait()
-        # wait for threads to finish
-        stdout_thread.join()
-        stderr_thread.join()
-
-        returncode = process.returncode
-    except Exception as e:
-        print(f"[ERROR]: error while running task {task_name}: {str(e)}")
-        raise e
-
-    print(f"[INFO]: task {task_name} finished with return code {returncode}")
-    return True
+
+def parse_task_opt(task):
+    # resource fields may be integers or arithmetic strings such as "2*2" or
+    # "10*1024*1024*1024"; eval() resolves them, so the config file must be trusted
+    opts = {}
+    if "num_gpus" in task:
+        opts["num_gpus"] = eval(task["num_gpus"]) if isinstance(task["num_gpus"], str) else task["num_gpus"]
+    if "num_cpus" in task:
+        opts["num_cpus"] = eval(task["num_cpus"]) if isinstance(task["num_cpus"], str) else task["num_cpus"]
+    if "memory" in task:
+        opts["memory"] = eval(task["memory"]) if isinstance(task["memory"], str) else task["memory"]
+    return opts
+
+
+@ray.remote
+def remote_execute_job(job_cmd: str, identifier_string: str, test_mode: bool) -> str | dict:
+    return util.execute_job(
+        job_cmd=job_cmd,
+        identifier_string=identifier_string,
+        test_mode=test_mode,
+        log_all_output=True,  # log_all_output=True streams job output in real time
+    )
 
-def submit_tasks(ray_address,pip,py_modules,tasks):
+def run_tasks(ray_address, pip, py_modules, tasks, test_mode=False):
     if not tasks:
         print("[WARNING]: no tasks to submit")
         return
 
     if not ray.is_initialized():
         try:
-            ray.init(address=ray_address, log_to_driver=True, runtime_env={
-                "pip": pip,
-                "py_modules": py_modules,
-            })
+            ray.init(
+                address=ray_address,
+                log_to_driver=True,
+                runtime_env={
+                    "pip": pip,
+                    "py_modules": py_modules,
+                },
+            )
         except Exception as e:
             raise RuntimeError(f"failed to initialize Ray: {str(e)}")
     task_results = []
-    for task in tasks:
-        num_gpus = eval(task["num_gpus"]) if isinstance(task["num_gpus"], str) else task["num_gpus"]
-        num_cpus = eval(task["num_cpus"]) if isinstance(task["num_cpus"], str) else task["num_cpus"]
-        memory = eval(task["memory"]) if isinstance(task["memory"], str) else task["memory"]
-        print(f"[INFO]: submitting task {task['name']} with num_gpus={num_gpus}, num_cpus={num_cpus}, memory={memory}")
-        task_results.append(task_wrapper.options(
-            num_gpus=num_gpus,
-            num_cpus=num_cpus,
-            memory=memory,
-        ).remote(task))
-
+    for task in tasks:
+        opts = parse_task_opt(task)
+        task_cmd = " ".join([sys.executable, *task["py_args"].split()])
+        print(f"[INFO]: submitting task {task['name']} with opts={opts}: {task_cmd}")
+        task_results.append(remote_execute_job.options(**opts).remote(task_cmd, task["name"], test_mode))
+
     try:
         results = ray.get(task_results)
-        for i, _ in enumerate(results):
-            print(f"[INFO]: Task {tasks[i]['name']} finished")
+        for i, result in enumerate(results):
+            print(f"[INFO]: Task {tasks[i]['name']} result: \n{result}")
         print("[INFO]: all tasks completed.")
     except KeyboardInterrupt:
         print("[INFO]: keyboard interrupt received, cancelling all tasks")
         for future in task_results:
-            ray.cancel(future,force=True)
+            ray.cancel(future, force=True)
         print("[INFO]: all tasks cancelled.")
         sys.exit(1)
     except Exception as e:
@@ -157,20 +139,21 @@ def submit_tasks(ray_address,pip,py_modules,tasks):
 def main():
     args = parse_args()
     try:
-        with open(args.task_cfg, 'r') as f:
+        with open(args.task_cfg) as f:
             config = yaml.safe_load(f)
     except Exception as e:
         raise SystemExit(f"error while loading task config: {str(e)}")
     tasks = config["tasks"]
-    py_modules = config.get("py_modules",None)
-    pip = config.get("pip",None)
-    submit_tasks(
-        ray_address=args.ray_address,
-        pip=pip,
-        py_modules=py_modules,
-        tasks=tasks,
-    )
+    py_modules = config.get("py_modules")
+    pip = config.get("pip")
+    run_tasks(
+        ray_address=args.ray_address,
+        pip=pip,
+        py_modules=py_modules,
+        tasks=tasks,
+        test_mode=args.test,
+    )
+
 
 if __name__ == "__main__":
     main()
-

From 3a7a349c0527cb024cd9f9e56ef7a6ce8fb82d04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=BE=E7=BF=8A?=
Date: Tue, 8 Jul 2025 18:01:06 +0800
Subject: [PATCH 3/3] docs(Ray): Update documentation about task_runner.py

---
 docs/source/features/ray.rst                     | 4 ++--
 scripts/reinforcement_learning/ray/submit_job.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/features/ray.rst b/docs/source/features/ray.rst
index a8b61cd0c31..f5d73e7a910 100644
--- a/docs/source/features/ray.rst
+++ b/docs/source/features/ray.rst
@@ -67,7 +67,7 @@ The three following files contain the core functionality of the Ray integration.
 
    .. literalinclude:: ../../../scripts/reinforcement_learning/ray/task_runner.py
       :language: python
-      :emphasize-lines: 9-55
+      :emphasize-lines: 13-59
 
 The following script can be used to submit aggregate
 jobs to one or more Ray cluster(s), which can be used for
@@ -79,7 +79,7 @@ resource requirements.
 
    .. literalinclude:: ../../../scripts/reinforcement_learning/ray/submit_job.py
       :language: python
-      :emphasize-lines: 13-59
+      :emphasize-lines: 13-61
 
 The following script can be used to extract KubeRay cluster information for aggregate job submission.
 
diff --git a/scripts/reinforcement_learning/ray/submit_job.py b/scripts/reinforcement_learning/ray/submit_job.py
index 8d0649ebf8a..84441eb7638 100644
--- a/scripts/reinforcement_learning/ray/submit_job.py
+++ b/scripts/reinforcement_learning/ray/submit_job.py
@@ -53,6 +53,7 @@
     python3 scripts/reinforcement_learning/ray/submit_job.py --aggregate_jobs wrap_resources.py --test
 
     # Example: submitting tasks with per-task resources, pip packages, and py_modules
+    # You may use relative paths for task_cfg and py_modules; place them in the scripts/reinforcement_learning/ray directory, which will be uploaded to the cluster.
     python3 scripts/reinforcement_learning/ray/submit_job.py --aggregate_jobs task_runner.py --task_cfg tasks.yaml
 
     # For all command line arguments
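
For reference, a complete ``tasks.yaml`` matching the schema documented in ``task_runner.py``
after these patches might look like the following sketch; the task name and the ``tensorboard``
pip package are illustrative placeholders rather than values taken from the patches:

.. code-block:: yaml

    pip: ["tensorboard"]
    py_modules: ["my_package/my_package"]
    tasks:
      - name: "cartpole_distributed"
        py_args: "-m torch.distributed.run --nnodes=1 --nproc_per_node=2 --rdzv_endpoint=localhost:29501 /workspace/isaaclab/scripts/reinforcement_learning/rsl_rl/train.py --task=Isaac-Cartpole-v0 --max_iterations 200 --headless --distributed"
        num_gpus: 2
        num_cpus: 10
        memory: "10*1024*1024*1024"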